stage1, standardize institution and country

68c6acc7 · jiangdongchen · 3e83f4af · 68c6acc7 · 68c6acc7 · 68c6acc7
Commit 68c6acc7 authored May 08, 2025 by jiangdongchen
19 changed files
--- a/README.md
+++ b/README.md
@@ -44,19 +44,21 @@
 - TODO:多模型交叉验证

 # 需求与解决方案
-1. 下载论文pdf
+1. TODO:下载论文pdf
    1. 常用网站agent下载
    2. 输出无法下载的条目
-2. 自动化提取信息和格式化
+2. stage1:自动化提取信息和格式化
    1. 通过config.json读取配置对象
    2. **遍历**excel的sheet
        1. **遍历**sheet中的论文名称和索引
            1. 用**大模型**读取pdf中第一页的论文名称和关键信息，存储到json文件夹下
            2. **遍历**excel表格中的论文名称进行模糊匹配, 匹配成功后
-                1. 用pdf文件中的论文名称和索引标准化重命名pdf文件和excel表格中的论文名称
-                2. 将pdf文件中的关键信息写入json文件中进行保存, 包括 标题 会议名称 作者姓名 机构 国家.
+                1. 将pdf文件中的关键信息写入json文件中进行保存, 包括 标题 会议名称 作者姓名 机构 国家.
+                2. 用pdf文件中的论文名称和索引标准化重命名pdf文件，并更新excel表格中的论文标题、会议名称、作者姓名、机构、国家.
                3. 将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给**大模型**匹配,匹配结果以“是/否”的形式写入目标excel表格中.
            3. 匹配失败后，使用warning记录并输出无法匹配的条目，方便后续处理.
+    3. 得到从pdf中提取的信息和格式化的excel表格.
+3. stage2: 国家机构索引、牛人判断

 # 代码结构说明
 1. psrc文件夹下是库函数

--- a/logs/citation_process.log
+++ b/logs/citation_process.log
@@ -892,3 +892,34 @@
 2025-05-08 16:19:23,091 - INFO - HTTP Request: POST https://api.siliconflow.cn/v1/chat/completions "HTTP/1.1 200 OK"
 2025-05-08 16:19:23,093 - INFO - Renamed: 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf -> 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf
 2025-05-08 16:19:30,059 - INFO - HTTP Request: POST https://api.siliconflow.cn/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-08 16:49:01,087 - INFO - 程序启动，日志文件保存在: C:\Users\17046\Documents\papertools\logs\citation_process.log
+2025-05-08 16:49:01,617 - INFO - [94mProcessing sheet: j24-DianNao family[0m
+2025-05-08 16:49:01,618 - INFO - [94mProcessing 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf[0m
+2025-05-08 16:49:58,022 - INFO - HTTP Request: POST https://api.siliconflow.cn/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-08 16:51:38,223 - INFO - 程序启动，日志文件保存在: C:\Users\17046\Documents\papertools\logs\citation_process.log
+2025-05-08 16:51:38,759 - INFO - [94mProcessing sheet: j24-DianNao family[0m
+2025-05-08 16:51:38,760 - INFO - [94mProcessing 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf[0m
+2025-05-08 16:52:58,406 - INFO - 程序启动，日志文件保存在: C:\Users\17046\Documents\papertools\logs\citation_process.log
+2025-05-08 16:52:58,916 - INFO - [94mProcessing sheet: j24-DianNao family[0m
+2025-05-08 16:52:58,918 - INFO - [94mProcessing 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf[0m
+2025-05-08 16:57:39,373 - INFO - 程序启动，日志文件保存在: C:\Users\17046\Documents\papertools\logs\citation_process.log
+2025-05-08 16:57:39,871 - INFO - [94mProcessing sheet: j24-DianNao family[0m
+2025-05-08 16:57:39,873 - INFO - [94mProcessing 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf[0m
+2025-05-08 16:58:33,162 - INFO - HTTP Request: POST https://api.siliconflow.cn/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-08 16:58:33,167 - INFO - Renamed: 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf -> 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf
+2025-05-08 17:01:09,552 - INFO - 程序启动，日志文件保存在: C:\Users\17046\Documents\papertools\logs\citation_process.log
+2025-05-08 17:01:10,069 - INFO - [94mProcessing sheet: j24-DianNao family[0m
+2025-05-08 17:01:10,071 - INFO - [94mProcessing 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf[0m
+2025-05-08 17:02:06,639 - INFO - HTTP Request: POST https://api.siliconflow.cn/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-08 17:02:06,642 - INFO - Renamed: 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf -> 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf
+2025-05-08 17:02:06,642 - INFO - Judge CCFA.
+2025-05-08 17:02:14,984 - INFO - HTTP Request: POST https://api.siliconflow.cn/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-08 17:02:15,690 - INFO - Matched: 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf -> idx: 1, excel_name: In-datacenter performance analysis of a tensor processing unit
+2025-05-08 17:02:15,690 - INFO - Change: 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf -> 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf
+2025-05-08 17:02:15,690 - INFO - [94mProcessing 10-A_Configurable_Cloud-Scale_DNN_Processor_for_Real-Time_AI.pdf[0m
+2025-05-08 17:02:36,531 - INFO - HTTP Request: POST https://api.siliconflow.cn/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-08 17:02:36,533 - INFO - Renamed: 10-A_Configurable_Cloud-Scale_DNN_Processor_for_Real-Time_AI.pdf -> 10-A_Configurable_Cloud-Scale_DNN_Processor_for_Real-Time_AI.pdf
+2025-05-08 17:02:36,533 - INFO - Judge CCFA.
+2025-05-08 17:08:08,688 - INFO - 程序启动，日志文件保存在: C:\Users\17046\Documents\papertools\logs\citation_process.log
+2025-05-08 17:08:09,207 - INFO - [94mProcessing sheet: j24-DianNao family[0m
+2025-05-08 17:08:09,209 - INFO - [94mProcessing 1-In-Datacenter_Performance_Analysis_of_a_Tensor_Processing_Unit.pdf[0m
--- a/main.py
+++ b/main.py
 import json
 import logging
-import psrc.citationProcess as CP
+import psrc.stage1.citationProcess as CP
 from pathlib import Path

 if __name__ == "__main__":

--- a/others/target.xlsx
+++ b/others/target.xlsx
--- a/psrc/checkCCFA.py
+++ b/psrc/checkCCFA.py
-from openai import OpenAI
-from pathlib import Path
-import json
-import openpyxl
-
-def chechCCFA( conferenceJournal, CCFA, configModel, client):
-    system_prompt = f"""
-    You are an expert academic conference/journal classifier. Your task is to determine if the given conference/journal name matches any entry in the provided CCF-A list.
-    CCF-A List (comma-separated): {CCFA}
-    Analysis Guidelines:
-    1. Perform fuzzy matching considering:
-       - Abbreviations vs full names (e.g. 'PPoPP' vs 'ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming')
-       - Common variations (e.g. 'IEEE Transactions' vs 'IEEE Trans.')
-       - Minor spelling differences
-    2. Return JSON with:
-       - "IsCCFA": true/false
-       - "MatchedName": the matched name from CCF-A list (empty string if no match)
-       - "Confidence": your confidence score (0-100)
-       - "Reason": a one-sentence justification for the decision
-    Example Output:
-    {{
-        "IsCCFA": "true",
-        "MatchedName": "IEEE International Symposium on High Performance Computer Architecture",
-        "Confidence": 95,
-        "Reason": "The input matches HPCA's full name"
-    }}
-    """
-
-    response = client.chat.completions.create(  
-        model=configModel,  
-        messages=[  
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": conferenceJournal},
-        ],  
-        temperature=0.2,  
-        max_tokens=4096,
-        # stream=True,
-        response_format={"type": "json_object"}  
-    ) 
-
-    return response.choices[0].message.content
-
-
-if __name__ == "__main__":
-    cwd_dir = Path.cwd()
-
-    # 构建 config.json 的完整路径
-    config_path = (cwd_dir / "config.json").resolve()
-
-    # 读取config.json中的配置参数
-    with open( config_path, 'r', encoding='utf-8') as f:
-        config = json.load(f)
-
-    client = OpenAI(api_key=config["api_key"], base_url=config["base_url"])
-    configModel = config["model"]
-    ccfa_excel_path = Path(config["ccfa_excel_path"])
-    wb = openpyxl.load_workbook(ccfa_excel_path)
-    sheetCCF = wb["CCF-A列表"]
-    # 序号	简称	全称
-    # 1	PPoPP	ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming
-    # 2	FAST	USENIX Conference on File and Storage Technologies
-    # 3	DAC	Design Automation Conference
-    # 4	HPCA	IEEE International Symposium on High Performance Computer Architecture
-    # 5	MICRO	IEEE/ACM International Symposium on Microarchitecture
-    CCFA_list = []
-    for row in sheetCCF.iter_rows(min_row=2, values_only=True): # 从第二行开始遍历
-        if row[0] and row[1]: # 确保索引和论文名称都存在
-            CCFA_list.append(row[1])
-            CCFA_list.append(row[2])
-    # 把list转为长的字符串, ','分割
-    CCFA = ','.join(CCFA_list)
-    conferenceJournal = "IEEE Journal of Solid-State Circuits"
-    result = chechCCFA(conferenceJournal, CCFA, configModel, client)
-    print(result)
\ No newline at end of file
--- a/psrc/stage1/__pycache__/citationProcess.cpython-312.pyc
+++ b/psrc/stage1/__pycache__/citationProcess.cpython-312.pyc
--- a/psrc/citationProcess.py
+++ b/psrc/citationProcess.py
 from pathlib import Path
 import logging
-from xml.etree.ElementPath import get_parent_map
 from openai import OpenAI
 import pypdf
 import openpyxl
@@ -64,6 +63,11 @@ def get_key_info( content, configModel, client):
        -   Extract all associated institutions of authors.
    -   **Countrys:**
        -   Extract all associated countrys of authors.
+        -   For institution names: try to use full names. Write only the name of the university/company, excluding departments, postal codes, countries, house numbers, cities, etc.
+        -   For non-English institution names: translate them into English first, and mark them so they can be checked manually (e.g. via Google).
+        -   For umbrella organizations such as the Chinese Academy of Sciences, specify the institute level, e.g. "Institute of Computing Technology, Chinese Academy of Sciences".
+        -   For university systems with multiple campuses (e.g. California State University, University of California, University of Maryland), specify the campus.
+
    -   **ISSUE:**
        -   Extract where the paper is published like journal or session.
    -   Title, authors, institutions and countrys should be four separate keys, not nested together.
@@ -91,7 +95,7 @@ def get_key_info( content, configModel, client):
            "United States",
            "China",
        ],
-        "ISSURE": [
+        "ISSUE": [
            "IEEE Transactions on Computers" 
        ]
    }
@@ -251,6 +255,14 @@ def citationProcess(config: dict):
                        authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
                        sheet.cell(row=idx+4, column=7, value=authors)  # 第7列是作者名称

+                        institution_list = result_dict.get("Institutions", [])
+                        institutions = ";".join(institution_list) if isinstance( institution_list, list) else ""
+                        sheet.cell(row=idx+4, column=9, value=institutions)  # 第9列是机构
+
+                        countrys_list = result_dict.get("Countrys", [])
+                        countrys = ";".join(countrys_list) if isinstance(countrys_list, list) else ""
+                        sheet.cell(row=idx+4, column=11, value=countrys)  # 第11列是国家
+
                        # CCFA判断
                        logging.info(f"Judge CCFA.")
                        CCFA_list = []
@@ -260,9 +272,13 @@ def citationProcess(config: dict):
                                CCFA_list.append(row[2])
                        # 把list转为长的字符串, ','分割
                        CCFA = ','.join(CCFA_list)
-                        conferenceJournal = pdf_issue[0]
+                        if pdf_issue:
+                            conferenceJournal = pdf_issue[0]
+                        else:
+                            logging.warning(f"{RED}No conference/journal information found for {file.name}{RESET}")
+                            conferenceJournal = ""
+
                        CCFA_flag = "否"
-                        print(conferenceJournal)
                        if conferenceJournal == "":
                            CCFA_flag = "否"
                        else:

--- a/add-niuren-location-zhimingqiye/3-find_celebrity_from_paper_with_authors.py
+++ b/add-niuren-location-zhimingqiye/3-find_celebrity_from_paper_with_authors.py
--- a/add-niuren-location-zhimingqiye/4-papers_with_niuren-final-to_csv.py
+++ b/add-niuren-location-zhimingqiye/4-papers_with_niuren-final-to_csv.py
--- a/add-niuren-location-zhimingqiye/5-find_location_of_papers_with_niuren.py
+++ b/add-niuren-location-zhimingqiye/5-find_location_of_papers_with_niuren.py
--- a/add-niuren-location-zhimingqiye/6-zhimingqiye.py
+++ b/add-niuren-location-zhimingqiye/6-zhimingqiye.py
--- a/add-niuren-location-zhimingqiye/8-add_country_hpca.py
+++ b/add-niuren-location-zhimingqiye/8-add_country_hpca.py
--- a/add-niuren-location-zhimingqiye/good_papers.jsonl
+++ b/add-niuren-location-zhimingqiye/good_papers.jsonl
--- a/add-niuren-location-zhimingqiye/hpca.xlsx
+++ b/add-niuren-location-zhimingqiye/hpca.xlsx
--- a/add-niuren-location-zhimingqiye/new_niuren_format-merged_turing.csv
+++ b/add-niuren-location-zhimingqiye/new_niuren_format-merged_turing.csv
--- a/add-niuren-location-zhimingqiye/readme.md
+++ b/add-niuren-location-zhimingqiye/readme.md
--- a/add-niuren-location-zhimingqiye/utils.py
+++ b/add-niuren-location-zhimingqiye/utils.py
--- a/add-niuren-location-zhimingqiye/机构国家汇总-初版.xlsx
+++ b/add-niuren-location-zhimingqiye/机构国家汇总-初版.xlsx
--- a/add-niuren-location-zhimingqiye/知名企业.xlsx
+++ b/add-niuren-location-zhimingqiye/知名企业.xlsx