Commit 91ab82b3 by Pengwei-Jin

Merge branch 'master' of http://62.234.201.16/nzy/papertools

parents 34bba471 382ba375
.gitignore:
.vscode/
Papers/
psrc/__pycache__/
psrc/stage1/__pycache__/
psrc/stage2/__pycache__/
json/
# Environment setup
- Make sure the cwd when running the Python scripts is the papertools repository folder
- All paths and parameters are configured in config.json
  - api_key
    - The current key was earned by our classmate Dongchen by running ads on Zhihu and only has a 100 RMB quota, so please use your own key whenever possible
    - If you use a key for a different API, remember to change the OpenAI client call accordingly; SiliconFlow is recommended here, since that is what I used to get the pipeline running
  - base_url
    - URL of the API endpoint
  - pdf_dir
    - Folder holding the paper PDFs
  - result_dir
    - Folder for the output JSON files containing the extracted key information
  - source_excel_path
    - The Excel workbook to be checked
    - Actual table entries start at row content_start + 1
    - Column 1: index
    - Column 3: paper title
    - Column 7: paper authors
  - target_excel_path
    - The formatted output workbook
  - ccfa_excel_path
    - The CCF-A reference sheet
  - logLevel
    - 10 means DEBUG level
    - 20 means INFO level
  - sheetNum: the number of worksheets to process
  - maxItem: the maximum number of entries per worksheet
- python3.12
  - Install any library that fails to import with pip, one by one (see the command after this list)
  - `openai`, `pypdf`, `openpyxl`, `fuzzywuzzy`, `bibtexparser`
  - `python-Levenshtein` (optional; speeds up fuzzywuzzy)
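For reference, the packages can also be installed in one go (package list taken from the imports in this repository; `python-Levenshtein` only speeds up `fuzzywuzzy`):

```
pip install openai pypdf openpyxl fuzzywuzzy python-Levenshtein bibtexparser
```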
# Usage
- Change row 3 of the Excel sheet to a BibTeX entry that contains the title of the cited paper, as shown in the image and the example below
  - ![](./others/bibtex.png)
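A minimal example of what row 3 can contain (hypothetical entry and field values; the pipeline only reads the `title` field via `bibtexparser`):

```
@inproceedings{citedpaper,
  title = {Title of the Cited Paper},
  author = {First Author and Second Author},
  booktitle = {Proceedings of the Example Conference},
  year = {2024}
}
```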
- Check config.json and set the parameters correctly so the program can find the files and parameters it needs
  - Default configuration
    - The paper PDFs are placed per sheet under the Papers/sheetname folder
    - The Excel workbook to be checked goes in the others folder
    - The output workbook is written to target_excel_path (./others/target.xlsx in the default config); the PDFs are renamed in place to standardized names
- Run `python main.py` to execute the program
  - While the program is running, do not open the target Excel file, otherwise the two will contend for file access and an error will occur
  - A sample log of a successful run is in the logs folder
- Resuming from a breakpoint: if the PDFs up to some index in the Excel sheet were extracted correctly and the sheet was updated correctly, but the PDF at the next index failed
  - Move the correctly processed PDFs to another folder; running the script again will then process only the remaining PDFs
- TODO: cross-validation with multiple models
- TODO: tuning the temperature setting
# Requirements and solutions
1. TODO: download the paper PDFs
   1. Download via agents for the common websites
   2. Output the entries that could not be downloaded
2. stage1: automated information extraction and formatting
   1. Read the configuration object from config.json
   2. **Iterate** over the sheets of the Excel workbook
      1. **Iterate** over the paper titles and indices in the sheet
         1. Use an **LLM** to read the paper title and key information from the first page of the PDF and store them in the json folder
         2. Read the citation information of the PDF from back to front, and use an **LLM** to find the citation index of the sheet's cited paper within the current PDF, storing it in the json folder
         3. **Iterate** over the paper titles in the Excel sheet and fuzzy-match them against the PDF title (see the sketch after this list); on a successful match:
            1. Save the key information from the PDF to a JSON file, including title, venue, author names, institutions, and countries.
            2. Use the paper title and index from the PDF to rename the PDF file to a standardized name and to standardize the paper title, venue, author names, institutions, and countries in the Excel sheet.
            3. First use an **LLM** to translate the English country names into Chinese, then write the corresponding country indices into the target Excel sheet.
            4. Hand the venue or journal name from the PDF together with the CCF-A venue/journal table to an **LLM** for matching, and write the result into the target Excel sheet as 是/否 (yes/no).
         4. On a failed match, log the unmatched entries as warnings so they can be handled later.
   3. This yields the information extracted from the PDFs and the formatted Excel sheet.
   4. Manual review
      1. There may be duplicate PDFs, which only become visible after the standardized renaming: when two PDFs get the same index in their names, a PDF was matched twice. Manually delete the duplicate PDF and Excel entry, judging the duplication against the information in the original sheet.
      2. Institutions and countries may contain duplicates; please check manually.
      3. Read the log and look for the red warning messages.
3. stage2: identifying well-known companies and notable researchers
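A minimal sketch of the fuzzy-matching step above, mirroring the normalization and the 85-point `partial_ratio` threshold used in `psrc/stage1/citationProcess.py` (the helper name `titles_match` is ours):

```python
from fuzzywuzzy import fuzz

def titles_match(pdf_title: str, excel_name: str, threshold: int = 85) -> bool:
    """Return True when the PDF title fuzzily matches an Excel entry."""
    def normalize(s: str) -> str:
        # Drop separators that vary between sources, then compare case-insensitively
        for ch in (" ", "_", ":", "-"):
            s = s.replace(ch, "")
        return s.lower()
    return fuzz.partial_ratio(normalize(pdf_title), normalize(excel_name)) >= threshold
```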
# Code structure
1. The psrc folder contains the library functions
2. config.json is the configuration file
3. main.py is the main program
4. The logs folder holds the log files
5. The json folder holds the JSON files with the extracted key information

config.json:
{
"api_key": "sk-otamesebhzzycgfynnssjkrkjlcoitdtstcruwbhohksdlel",
"base_url": "https://api.siliconflow.cn/v1",
"model": "Pro/deepseek-ai/DeepSeek-V3",
"pdf_dir": "./Papers",
"result_dir": "./json",
"source_excel_path": "./others/论文被引用情况-陈老师-2025.05.01.xlsx",
"content_start": 4,
"ccfa_excel_path": "./others/CCFA.xlsx",
"target_excel_path": "./others/target.xlsx",
"logLevel": 20,
"sheetNum": 1,
"maxItem": 333
}

main.py:
import json
import logging
import psrc.stage1.citationProcess as CP
from pathlib import Path

if __name__ == "__main__":
    cwd_dir = Path.cwd()
    # Build the full path to config.json
    config_path = (cwd_dir / "config.json").resolve()
    # Read the configuration parameters from config.json
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    # The / operator on a Path object joins path components
    # Create the log directory
    log_dir = cwd_dir / "logs"
    log_dir.mkdir(exist_ok=True)
    # Configure the logging system
    log_file = log_dir / "citation_process.log"
    logLevel = config["logLevel"]
    # logging.basicConfig() is the standard-library helper for quickly configuring
    # the root logger. With level=INFO only records at INFO and above are handled
    # (INFO, WARNING, ERROR, CRITICAL); logging.debug(...) would be suppressed.
    logging.basicConfig(
        # %(asctime)s: timestamp of the record (default format: YYYY-MM-DD HH:MM:SS)
        # %(levelname)s: level name (e.g. INFO, WARNING)
        # %(message)s: the log message itself
        level=logLevel, format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    logging.info(f"Program started; log file saved at: {log_file}")
    CP.citationProcess(config)

psrc/stage1/citationProcess.py:
from pathlib import Path
import logging
from openai import OpenAI
import pypdf
import openpyxl
from fuzzywuzzy import fuzz
import json
import bibtexparser
import psrc.stage1.country_to_idx as c2i

# ANSI color codes used to highlight log messages in the terminal
RED = '\033[91m'
GREEN = '\033[92m'
BLUE = '\033[94m'
RESET = '\033[0m'
def checkCCFA(conferenceJournal, CCFA, configModel, client):
    system_prompt = f"""
You are an expert academic conference/journal classifier. Your task is to determine if the given conference/journal name matches any entry in the provided CCF-A list.
CCF-A List (comma-separated): {CCFA}
Analysis Guidelines:
1. Perform fuzzy matching considering:
   - Abbreviations vs full names (e.g. 'PPoPP' vs 'ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming')
   - Common variations (e.g. 'IEEE Transactions' vs 'IEEE Trans.')
   - Minor spelling differences
2. Return JSON with:
   - "IsCCFA": true/false
   - "MatchedName": the matched name from CCF-A list (empty string if no match)
   - "Confidence": your confidence score (0-100)
Example Output:
{{
    "IsCCFA": true,
    "MatchedName": "IEEE International Symposium on High Performance Computer Architecture",
    "Confidence": 95,
    "Reason": "The input matches HPCA's full name"
}}
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": conferenceJournal},
        ],
        temperature=0.2,
        max_tokens=4096,
        # stream=True,
        response_format={"type": "json_object"}
    )
    content = response.choices[0].message.content
    # The previous version returned the raw JSON string, which is always truthy,
    # so the caller's yes/no check always answered yes. Parse the reply instead.
    try:
        return str(json.loads(content).get("IsCCFA", "")).lower() == "true"
    except (json.JSONDecodeError, TypeError):
        logging.warning(f"{RED}Could not parse CCFA check response: {content}{RESET}")
        return False
def get_key_info(content, configModel, client):
    system_prompt = """
Act as an expert metadata extraction assistant.
Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
Your goal is to extract the document title, all authors, and their corresponding affiliations.
Extraction Guidelines:
- **Title:** Extract the main title of the document. If ambiguous or missing, use "".
- **Authors:**
  - Identify all listed authors. Maintain the order presented in the text if possible.
  - Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- **Institutions:**
  - Extract all institutions the authors are affiliated with.
  - Keep only one entry per institution (deduplicate).
  - Try to use full names. Only write the names of universities/companies, excluding departments, postal codes, countries, house numbers, cities, etc.
  - For foreign names: translate them first, then manually check them using Google (mark them).
  - In cases like the Chinese Academy of Sciences, specify the institute level. For example, the Institute of Computing Technology, Chinese Academy of Sciences.
  - In cases of branch campuses, specify the branch. For example, California State University, University of California, University of Maryland.
- **Countrys:**
  - Extract all countries the institutions belong to.
- **ISSUE:**
  - Extract where the paper is published, e.g. the journal or conference.
- Title, authors, institutions and countrys should be four separate keys, not nested together.
- Capitalize the first letter of each key.
- **Handling Missing Data:** If no data for a field can be identified in the text, the field in the JSON should be an empty list `[]`.
Example Output:
{
    "Title": "Laius: Towards Latency Awareness and Improved Utilization of Spatial Multitasking Accelerators in Datacenters",
    "Authors": [
        "Quan Chen",
        "Daniel Edward Mawhirter",
        "Bo Wu",
        "Chao Li"
    ],
    "Institutions": [
        "Shanghai Jiao Tong University",
        "Colorado School of Mines"
    ],
    "Countrys": [
        "China",
        "United States"
    ],
    "ISSUE": [
        "IEEE Transactions on Computers"
    ]
}
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": content},
        ],
        temperature=0.25,
        max_tokens=4096,
        # stream=True,
        response_format={"type": "json_object"}
    )
    logging.debug(response.choices[0].message.content)
    return response.choices[0].message.content
    # Streaming variant, kept for reference:
    # for chunk in response:
    #     if not chunk.choices:
    #         continue
    #     if chunk.choices[0].delta.content:
    #         # Print the incremental tokens without a newline (streaming output)
    #         print(chunk.choices[0].delta.content, end="", flush=True)
# Extracts text content from the first page of a PDF.
def extract_first_page_text(pdf_path):
    try:
        # Try to read the PDF file
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) > 0:
            first_page = reader.pages[0]
            text = first_page.extract_text()
            if text:
                # text.split() splits on any whitespace (spaces/newlines/tabs) and
                # collapses consecutive runs; " ".join(...) rejoins with single
                # spaces, so stray whitespace becomes one space.
                cleaned_text = " ".join(text.split())
                cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
                    "utf-8"
                )
                return cleaned_text
            else:
                logging.warning(f"{RED}No text found on the first page of {pdf_path.name}{RESET}")
                return None
        else:
            logging.warning(f"{RED}PDF has no pages: {pdf_path.name}{RESET}")
            return None
    except Exception as e:
        # Catch and log any exception raised while reading the PDF
        logging.warning(f"{RED}Failed to read PDF {pdf_path.name}: {str(e)}{RESET}")
        return None
def get_citation_ids(pdf_path, title, configModel, client):
    try:
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) == 0:
            logging.warning(f"PDF has no pages: {pdf_path.name}")
            return None
        system_prompt = f"""
You are a professional academic citation analysis assistant. Your task is to find, in the text of a PDF page, the citation number that refers to the given paper.
Paper title: {title}
Analysis guidelines:
1. Look for citation markers such as [1], [2] in the text
2. A citation marker is usually followed by the title of the cited paper
3. If a matching citation number is found, return that number
4. If no matching citation is found, return an empty string
Return only the citation number itself, nothing else.
"""
        # Check the pages one by one, starting from the last page
        for page_num in range(len(reader.pages) - 1, -1, -1):
            page = reader.pages[page_num]
            text = page.extract_text()
            if text:
                cleaned_text = " ".join(text.split())
                response = client.chat.completions.create(
                    model=configModel,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": cleaned_text},
                    ],
                    temperature=0.1,
                    max_tokens=10,
                    response_format={"type": "text"}
                )
                result = response.choices[0].message.content.strip()
                if result.isdigit():
                    return result
        logging.warning(f"Citation ID for {title} not found in {pdf_path.name}")
        return None
    except Exception as e:
        logging.error(f"An error occurred while processing {pdf_path.name}: {str(e)}")
        return None
# Read the indices and paper titles from the Excel sheet, starting at row 4
def read_rough_nameIndex_from_excel(sheet, maxItem):
    index_list = []
    paperName_list = []
    # Iterate starting from row 4
    for idx, row in enumerate(sheet.iter_rows(min_row=4, values_only=True)):
        if idx >= maxItem:  # Limit the number of rows read
            break
        if row[0] and row[2]:  # Make sure both the index and the paper title exist
            index_list.append(row[0])
            paperName_list.append(row[2])
    return index_list, paperName_list
def translate_countries(countries_str: str, configModel, client):
    system_prompt = """
You are a professional translation assistant. Translate the given English country names into Chinese.
The input may contain several country names separated by semicolons.
Output the corresponding Chinese country names, also separated by semicolons.
Example input: United States; China
Example output: 美国; 中国
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": countries_str}
        ],
        temperature=0.2,
        max_tokens=4096,
        response_format={"type": "text"}
    )
    return response.choices[0].message.content.strip()
def citationProcess(config: dict):
    client = OpenAI(api_key=config["api_key"],
                    base_url=config["base_url"])
    excel_path = Path(config["source_excel_path"])
    target_path = Path(config["target_excel_path"])
    ccfa_excel_path = Path(config["ccfa_excel_path"])
    # Read the Excel workbook
    wb = openpyxl.load_workbook(excel_path)
    # Read the CCF-A list
    ccfa_wb = openpyxl.load_workbook(ccfa_excel_path)
    sheetCCF = ccfa_wb["CCF-A列表"]
    # Iterate over all worksheets in the workbook. The loop variable is named
    # sheet_idx so the fuzzy-matching loop below cannot shadow it (the original
    # code reused idx for both loops).
    for sheet_idx, sheet_name in enumerate(wb.sheetnames):
        if sheet_idx == config["sheetNum"]:
            break
        sheet = wb[sheet_name]
        logging.info(f"{BLUE}Processing sheet: {sheet_name}{RESET}")
        index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["maxItem"])
        rst_dir = Path.cwd() / config["result_dir"] / sheet_name
        rst_dir.mkdir(parents=True, exist_ok=True)  # Make sure the result directory exists
        pdf_directory = Path.cwd() / config["pdf_dir"] / sheet_name
        pdf_files = pdf_directory.rglob("*.pdf")  # Recursive search yielding the paths of all PDF files
        # Read the BibTeX text from row 3
        bibtex_row = sheet[3]  # Row 3; row indices are 1-based
        # Convert each cell.value to a string and concatenate
        bibtex_text = "".join(map(lambda cell: str(cell.value) if cell.value is not None else "", bibtex_row))
        # Parse the BibTeX text
        try:
            bib_database = bibtexparser.loads(bibtex_text)
            if bib_database.entries:
                entry = bib_database.entries[0]
                cited_title_str = entry.get('title', '')
            else:
                cited_title_str = ""
                logging.warning(f"No BibTeX entry found in sheet {sheet_name} row 3")
        except Exception as e:
            cited_title_str = ""
            logging.error(f"Error parsing BibTeX in sheet {sheet_name} row 3: {str(e)}")
        # Iterate over all PDF files belonging to the current worksheet
        for file in pdf_files:
            logging.info(f"{BLUE}Processing {file.name}{RESET}")
            first_page_text = extract_first_page_text(file)
            if first_page_text is None:
                logging.error(f"Failed to extract text from first page of {file.name}")
                continue  # Skip this file and move on to the next one
            configModel = config["model"]
            # Extract the key information
            result = get_key_info(first_page_text, configModel, client)
            cit_id = get_citation_ids(file, cited_title_str, configModel, client)
            if result is not None:
                # Parse the JSON result and pull out the paper title
                result_dict = json.loads(result)
                pdf_title = result_dict["Title"]
                pdf_issue = result_dict["ISSUE"]
                # Fuzzy-match against the Excel entries
                matched = False
                for idx, excel_name in zip(index_list, paperName_list):
                    # Preprocess the strings: split('.pdf')[0] keeps the part
                    # before any '.pdf' suffix
                    clean_excel_name = excel_name.split('.pdf')[0].replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
                    clean_pdf_title = pdf_title.replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
                    logging.debug(f"clean_excel_name: {clean_excel_name}")
                    logging.debug(f"clean_pdf_title: {clean_pdf_title}")
                    similarity = fuzz.partial_ratio(clean_pdf_title.lower(), clean_excel_name.lower())
                    if similarity >= 85:
                        # Rename the PDF file, replacing colons, spaces, question marks
                        # and slashes with underscores to avoid illegal filename characters
                        new_pdf_name = f"{idx}-{pdf_title.replace(': ', '_').replace(' ', '_').replace('?', '_').replace('/', '_')}.pdf"
                        new_pdf_path = file.parent / new_pdf_name
                        try:
                            file.rename(new_pdf_path)
                            logging.info(f"Renamed: {file.name} -> {new_pdf_name}")
                        except FileExistsError:
                            # A file with this index already exists: likely a duplicate PDF
                            logging.warning(f"Renamed failed: filename {new_pdf_name} already exists with idx {idx}.")
                            break
                        # Save the key information to a JSON file
                        rst_path = rst_dir / f"{idx}.json"
                        rst_path.write_text(result + "\n", encoding='utf-8')  # Explicitly use UTF-8 encoding
                        # Update the Excel entries
                        sheet.cell(row=idx+config["content_start"], column=3, value=pdf_title)  # Column 3 is the paper title
                        issue = result_dict.get("ISSUE", [])
                        if issue and isinstance(issue, list) and len(issue) > 0:  # Make sure issue is a valid, non-empty list
                            sheet.cell(row=idx+config["content_start"], column=4, value=issue[0])
                            logging.info(f"Standardization issue info.")
                        else:
                            logging.warning(f"{RED}Invalid ISSUE data: {issue}{RESET}")
                        if cit_id is not None:
                            sheet.cell(row=idx+config["content_start"], column=6, value=cit_id)  # Column 6 is cit_id
                        else:
                            logging.warning(f"{RED}cit_id is None. There may be several PDF versions; please find the version that actually contains the citation (T_T){RESET}")
                        authors_list = result_dict.get("Authors", [])
                        authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
                        sheet.cell(row=idx+config["content_start"], column=7, value=authors)  # Column 7 is the author names
                        logging.info(f"Standardization author info.")
                        institution_list = result_dict.get("Institutions", [])
                        institutions = ";".join(institution_list) if isinstance(institution_list, list) else ""
                        sheet.cell(row=idx+config["content_start"], column=9, value=institutions)  # Column 9 is the institutions
                        logging.info(f"Standardization institution info.")
                        countrys_list = result_dict.get("Countrys", [])
                        countrys = ";".join(countrys_list) if isinstance(countrys_list, list) else ""
                        # Translate countrys into Chinese. The index lookup only makes
                        # sense when the translation exists (the original code hit a
                        # NameError here when countrys was empty).
                        if countrys:
                            translated_countrys = translate_countries(countrys, config["model"], client)
                            sheet.cell(row=idx+config["content_start"], column=11, value=translated_countrys)  # Column 11 is the Chinese country names
                            logging.debug(f"Translated countrys info: {translated_countrys}")
                            country_idxs = c2i.country_to_idx(translated_countrys)
                            sheet.cell(row=idx+config["content_start"], column=10, value=country_idxs)  # Column 10 is the country indices
                            logging.info(f"Standardization countrys info.")
                        else:
                            logging.warning(f"{RED}No countrys info to translate.{RESET}")
                        # CCF-A check
                        logging.info(f"Judge CCFA.")
                        CCFA_list = []
                        for row in sheetCCF.iter_rows(min_row=2, values_only=True):  # Iterate starting from row 2
                            if row[0] and row[1] and row[2]:  # Make sure the index and both name columns exist
                                CCFA_list.append(row[1])
                                CCFA_list.append(row[2])
                        # Join the list into one long comma-separated string
                        CCFA = ','.join(CCFA_list)
                        if pdf_issue:
                            conferenceJournal = pdf_issue[0]
                        else:
                            logging.warning(f"{RED}The LLM found no venue/journal info in the paper; the CCF-A flag defaults to 否. Please confirm this entry manually. {file.name}{RESET}")
                            conferenceJournal = ""
                        if conferenceJournal == "":
                            CCFA_flag = "否"
                        else:
                            CCFA_flag = "是" if checkCCFA(conferenceJournal, CCFA, configModel, client) else "否"
                        logging.info(f"{CCFA_flag}")
                        sheet.cell(row=idx+config["content_start"], column=5, value=CCFA_flag)  # Column 5 is the CCF-A flag
                        # Save the modified Excel workbook
                        wb.save(target_path)
                        logging.info(f"Matched: {file.name} -> idx: {idx}, excel_name: {excel_name}")
                        logging.info(f"Change: {file.name} -> {new_pdf_name}")
                        matched = True
                        break
                if not matched:
                    logging.warning(f"{RED}Not matched: {file.name}{RESET}")
            else:
                logging.warning(f"{RED}Failed to extract key info from {file.name}{RESET}")

psrc/stage1/country_to_idx.py:
country_to_id_map = {
"美国": 1,
"中国": 2,
"日本": 3,
"韩国": 4,
"新加坡": 5,
"台湾": 6,
"香港": 7,
"澳门": 8,
"中国台湾": 6,
"中国香港": 7,
"中国澳门": 8,
"法国": 9,
"英国": 10,
"德国": 11,
"意大利": 12,
"西班牙": 13,
"加拿大": 14,
"荷兰": 15,
"印度": 16,
"阿联酋": 17,
"比利时": 18,
"俄罗斯": 19,
"阿富汗": 20,
"亚美尼亚": 21,
"阿塞拜疆": 22,
"巴林": 23,
"孟加拉国": 24,
"不丹": 25,
"文莱": 26,
"缅甸": 27,
"柬埔寨": 28,
"塞浦路斯": 29,
"东帝汶": 30,
"格鲁吉亚": 31,
"印度尼西亚": 32,
"伊朗": 33,
"伊拉克": 34,
"以色列": 35,
"约旦": 36,
"哈萨克斯坦": 37,
"科威特": 38,
"吉尔吉斯斯坦": 39,
"老挝": 40,
"黎巴嫩": 41,
"马来西亚": 42,
"马尔代夫": 43,
"蒙古": 44,
"尼泊尔": 45,
"朝鲜": 46,
"阿曼": 47,
"巴基斯坦": 48,
"巴勒斯坦": 49,
"菲律宾": 50,
"卡塔尔": 51,
"沙特阿拉伯": 52,
"斯里兰卡": 53,
"叙利亚": 54,
"塔吉克斯坦": 55,
"泰国": 56,
"土库曼斯坦": 57,
"乌兹别克斯坦": 58,
"越南": 59,
"也门": 60,
"北塞浦路斯": 61,
"纳戈尔诺-卡拉巴 赫": 62,
"阿尔及利亚": 63,
"安哥拉": 64,
"贝宁": 65,
"博茨瓦纳": 66,
"布基纳法索": 67,
"布隆迪": 68,
"佛得角": 69,
"喀麦隆": 70,
"中非共和国": 71,
"乍得": 72,
"科摩罗": 73,
"刚果": 75,
"科特迪瓦": 76,
"吉布提": 77,
"埃及": 78,
"赤道几内亚": 79,
"厄立特里亚": 80,
"埃塞俄比亚": 81,
"加蓬": 82,
"冈比亚": 83,
"加纳": 84,
"几内亚": 85,
"几内亚比绍": 86,
"肯尼亚": 87,
"莱索托": 88,
"利比里亚": 89,
"利比亚": 90,
"马达加斯加": 91,
"马拉维": 92,
"马里": 93,
"毛里塔尼亚": 94,
"毛里求斯": 95,
"摩洛哥": 96,
"莫桑比克": 97,
"纳米比亚": 98,
"尼日尔": 99,
"尼日利亚": 100,
"卢旺达": 101,
"圣多美和普林西比": 102,
"塞内加尔": 103,
"塞舌尔": 104,
"塞拉利昂": 105,
"索马里": 106,
"南非": 107,
"南苏丹": 108,
"苏丹": 109,
"斯威士兰": 110,
"坦桑尼亚": 111,
"多哥": 112,
"突尼斯": 113,
"乌干达": 114,
"赞比亚": 115,
"津巴布韦": 116,
"西撒哈拉": 117,
"阿尔巴尼亚": 118,
"安道尔": 119,
"奥地利": 120,
"白俄罗斯": 121,
"波斯尼亚和黑塞哥维那": 122,
"保加利亚": 123,
"克罗地亚": 124,
"捷克": 125,
"丹麦": 126,
"爱沙尼亚": 127,
"芬兰": 128,
"希腊": 129,
"匈牙利": 130,
"冰岛": 131,
"爱尔兰": 132,
"拉脱维亚": 133,
"列支敦士登": 134,
"立陶宛": 135,
"卢森堡": 136,
"马耳他": 137,
"摩尔多瓦": 138,
"摩纳哥": 139,
"黑山": 140,
"北马其顿": 141,
"挪威": 142,
"波兰": 143,
"葡萄牙": 144,
"罗马尼亚": 145,
"圣马力诺": 146,
"塞尔维亚": 147,
"斯洛伐克": 148,
"斯洛文尼亚": 149,
"瑞典": 150,
"瑞士": 151,
"乌克兰": 152,
"梵蒂冈": 153,
"科索沃": 154,
"法罗群岛": 155,
"直布罗陀": 156,
"安提瓜和巴布达": 157,
"巴哈马": 158,
"巴巴多斯": 159,
"伯利兹": 160,
"哥斯达黎加": 161,
"古巴": 162,
"多米尼加": 163,
"多米尼加共和国": 164,
"萨尔瓦多": 165,
"格林纳达": 166,
"危地马拉": 167,
"海地": 168,
"洪都拉斯": 169,
"牙买加": 170,
"墨西哥": 171,
"尼加拉瓜": 172,
"巴拿马": 173,
"圣基茨和尼维斯": 174,
"圣卢西亚": 175,
"圣文森特和格林纳丁斯": 176,
"特立尼达和多巴哥": 177,
"百慕大": 178,
"格陵兰": 179,
"波多黎各": 180,
"美属维尔京群岛": 181,
"英属维尔京群岛": 182,
"开曼群岛": 183,
"安圭拉": 184,
"蒙特塞拉特": 185,
"阿根廷": 186,
"玻利维亚": 187,
"巴西": 188,
"智利": 189,
"哥伦比亚": 190,
"厄瓜多尔": 191,
"圭亚那": 192,
"巴拉圭": 193,
"秘鲁": 194,
"苏里南": 195,
"乌拉圭": 196,
"委内瑞拉": 197,
"法属圭亚那": 198,
"福克兰群岛": 199,
"澳大利亚": 200,
"斐济": 201,
"基里巴斯": 202,
"马绍尔群岛": 203,
"密克罗尼西亚": 204,
"瑙鲁": 205,
"新西兰": 206,
"帕劳": 207,
"巴布亚新几内亚": 208,
"萨摩亚": 209,
"所罗门群岛": 210,
"汤加": 211,
"图瓦卢": 212,
"瓦努阿图": 213,
"库克群岛": 214,
"纽埃": 215,
"法属波利尼西亚": 216,
"新喀里多尼亚": 217,
"瓦利斯和富图纳": 218,
"托克劳": 219,
"皮特凯恩群岛": 220,
"土耳其": 221,
}
def country_to_idx(country_str: str) -> str:
    # Map a semicolon-separated string of Chinese country names to a
    # semicolon-separated string of their indices.
    # Example: country_to_idx("美国; 中国") returns "1;2"
    result = []
    country_str = country_str.strip()
    for country in country_str.split(";"):
        clean_country = country.strip()
        if clean_country not in country_to_id_map:
            err_msg = f"Unknown Country: {clean_country}"
            print(err_msg)
            raise ValueError(err_msg)
        else:
            result.append(str(country_to_id_map[clean_country]))
    return ";".join(result)