Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
papertools
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ziyuan Nan
papertools
Commits
e46e4e8c
Commit
e46e4e8c
authored
May 11, 2025
by
zhengxinhan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
solve the compound first name problem
parent
e02d76c7
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
100 additions
and
19 deletions
+100
-19
papertools_niuren_ccfa/__pycache__/utils.cpython-310.pyc
+0
-0
papertools_niuren_ccfa/info/doubohan.xlsx
+0
-0
papertools_niuren_ccfa/niurenpipei.py
+2
-2
papertools_niuren_ccfa/niurenpipei_xinbiao.py
+15
-2
papertools_niuren_ccfa/output/doubohan_牛人筛选.xlsx
+0
-0
papertools_niuren_ccfa/utils.py
+83
-15
No files found.
papertools_niuren_ccfa/__pycache__/utils.cpython-310.pyc
View file @
e46e4e8c
No preview for this file type
papertools_niuren_ccfa/info/doubohan.xlsx
0 → 100644
View file @
e46e4e8c
File added
papertools_niuren_ccfa/niurenpipei.py
View file @
e46e4e8c
...
...
@@ -9,8 +9,8 @@ from copy import copy
from
joblib
import
Parallel
,
delayed
from
utils
import
standardized_name
,
name_in_niuren_list
input_file_path
=
'info/
论文被引用统计-陈老师-截止2025年X月XX日
.xlsx'
output_file_path
=
'output/
论文被引用统计-陈老师-截止2025年X月XX日
_牛人筛选.xlsx'
input_file_path
=
'info/
doubohan
.xlsx'
output_file_path
=
'output/
doubohan
_牛人筛选.xlsx'
output_dir
=
os
.
path
.
dirname
(
output_file_path
)
if
not
os
.
path
.
exists
(
output_dir
):
...
...
papertools_niuren_ccfa/niurenpipei_xinbiao.py
View file @
e46e4e8c
...
...
@@ -9,8 +9,14 @@ from copy import copy
from
joblib
import
Parallel
,
delayed
from
utils
import
standardized_name
,
name_in_niuren_list
<<<<<<<
Updated
upstream
:
papertools_niuren_ccfa
/
niurenpipei_xinbiao
.
py
input_file_path
=
'info/新表-论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
output_file_path
=
'output/新表-论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'
=======
input_file_path
=
'info/doubohan.xlsx'
# input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
output_file_path
=
'output/doubohan_牛人筛选.xlsx'
>>>>>>>
Stashed
changes
:
papertools_niuren_ccfa
/
niurenpipei_update
.
py
output_dir
=
os
.
path
.
dirname
(
output_file_path
)
if
not
os
.
path
.
exists
(
output_dir
):
...
...
@@ -153,11 +159,11 @@ if __name__ == "__main__":
fake_niuren_names
,
fake_niuren_papers
=
load_fake_niuren
(
fake_niuren
,
fake_niuren_papers_path
)
# 读取表头(第4行作为列名)
original_header
=
pd
.
read_excel
(
input_file_path
,
nrows
=
0
,
header
=
3
)
original_header
=
pd
.
read_excel
(
input_file_path
,
nrows
=
0
,
header
=
2
)
column_names
=
original_header
.
columns
.
tolist
()
# 读取数据(从第8行开始)
input_df
=
pd
.
read_excel
(
input_file_path
,
skiprows
=
7
,
header
=
None
,
names
=
column_names
)
input_df
=
pd
.
read_excel
(
input_file_path
,
skiprows
=
2
,
header
=
None
,
names
=
column_names
)
# 输出表头和数据的基本信息
print
(
"表头元素:"
)
...
...
@@ -273,9 +279,16 @@ if __name__ == "__main__":
ws_new
.
cell
(
row
=
4
,
column
=
suspected_niuren_col_index
,
value
=
'疑似牛人'
)
# 更新数据(从第8行开始)
<<<<<<<
Updated
upstream
:
papertools_niuren_ccfa
/
niurenpipei_xinbiao
.
py
for
i
,
row
in
input_df
.
iterrows
():
excel_row
=
i
+
8
# 转换为Excel的行号(从1开始)
=======
# for i, row in input_df.iterrows():
for
i
,
(
index
,
row
)
in
enumerate
(
input_df
.
iterrows
(),
start
=
3
):
# excel_row = i + 8 # 转换为Excel的行号(从1开始)
print
(
i
,
niuren_col_index
)
>>>>>>>
Stashed
changes
:
papertools_niuren_ccfa
/
niurenpipei_update
.
py
# 只更新牛人相关的三列
ws_new
.
cell
(
row
=
excel_row
,
column
=
niuren_col_index
,
value
=
row
[
niuren_col
])
ws_new
.
cell
(
row
=
excel_row
,
column
=
niuren_order_col_index
,
value
=
row
[
niuren_order_col
])
...
...
papertools_niuren_ccfa/output/doubohan_牛人筛选.xlsx
0 → 100644
View file @
e46e4e8c
File added
papertools_niuren_ccfa/utils.py
View file @
e46e4e8c
...
...
@@ -2,8 +2,8 @@
Author: zhengxinhan
Date: 2025-05-08 15:02:02
LastEditors: zhengxinhan
LastEditTime: 2025-05-
09 02:46:29
FilePath: /papertools
-master/niurenshaixuan
/utils.py
LastEditTime: 2025-05-
11 12:42:43
FilePath: /papertools
_niuren_ccfa
/utils.py
Description:
Copyright (c) 2025 by m13521952989@163.com, All Rights Reserved.
...
...
@@ -11,66 +11,132 @@ Copyright (c) 2025 by m13521952989@163.com, All Rights Reserved.
# Surname particles that, when they appear as a standalone token, are kept
# attached to the family name.  Matching is case-sensitive.
_SURNAME_PREFIXES = frozenset({
    "De", "Del", "Della", "Di", "Da", "Van", "Von", "Le", "La", "O'", "Mc",
    "Mac", "Al", "El", "Bin", "Ben", "Ibn", "Ter", "Saint", "St.", "Dos",
    "Das", "Los", "Las", "San",
})

# Titles, honorifics and stray tokens that are dropped before parsing.
_IGNORED_TOKENS = frozenset({
    ".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.",
    "The", "Honorable",
})


def _is_parenthesized(token):
    """Return True when *token* is wrapped in a pair of parentheses."""
    return token.startswith("(") and token.endswith(")")


def _split_three_part_name(parts):
    """Return ``(last_name, first_name)`` for a name made of three tokens."""
    first, middle, last = parts

    if middle in _SURNAME_PREFIXES:
        # Barbara De Salvo --> De Salvo, Barbara
        return f"{middle} {last}", first
    if _is_parenthesized(first):
        # (Alexander) Philip Dawid --> Dawid, Philip (Alexander)
        return last, f"{middle} {first}"
    if _is_parenthesized(middle):
        # Xinyan (Tracy) Cui --> Cui, Xinyan (Tracy)
        return last, f"{first} {middle}"
    if _is_parenthesized(last):
        # "Ye Fred (Ying)": family name Ye, given name Ying, English name
        # Fred --> Ye, Ying (Fred)
        return first, f"{last[1:-1]} ({middle})"
    if first.endswith("."):
        # M. Jane Smith --> Smith, Jane M.
        # K.W. Michael Siu --> Siu, Michael K.W.
        return last, f"{middle} {first}"
    if middle.endswith("."):
        # Jane M. Smith --> Smith, Jane M.
        # Pierre J.H. Richardson --> Richardson, Pierre J.H.
        return last, f"{first} {middle}"
    if last.endswith("."):
        # Wimmer-Schweingruber Robert F. --> Wimmer-Schweingruber, Robert F.
        # Wilderer Peter A. --> Wilderer, Peter A.
        return first, f"{middle} {last}"
    # William Nelson Joy --> Joy, William Nelson
    # Michael J Carey --> Carey, Michael J
    return last, f"{first} {middle}"


def _split_multi_part_name(parts):
    """Return ``(last_name, first_name)`` for a name of four or more tokens."""
    # A prefix right before the final token forms a two-word surname,
    # e.g. "Juan Carlos De Silva" --> "De Silva, Juan Carlos".
    if parts[-2] in _SURNAME_PREFIXES:
        return " ".join(parts[-2:]), " ".join(parts[:-2])

    # Otherwise look for an earlier prefix that opens a longer compound
    # surname, e.g. "Maria Del Carmen Gomez" --> "Del Carmen Gomez, Maria".
    # The scan starts at index 1: a prefix in the very first position would
    # swallow the whole name and leave the given name empty.
    for start in range(1, len(parts) - 1):
        if parts[start] not in _SURNAME_PREFIXES:
            continue
        between = parts[start + 1:-1]
        if all(tok in _SURNAME_PREFIXES or tok[0].isupper() for tok in between):
            return " ".join(parts[start:]), " ".join(parts[:start])

    # Default: the final token is the surname.
    return parts[-1], " ".join(parts[:-1])


def standardized_name(name):
    """Convert a "First Last" style name to "Last, First" form.

    Examples: ``'John Doe' -> 'Doe, John'``,
    ``'M. Jane Smith' -> 'Smith, Jane M.'``.

    :param name: the name string to normalise
    :return: *name* unchanged when it already contains a comma or when
        nothing is left after removing titles; the single remaining token
        when only one is left; otherwise ``"<last name>, <first name>"``
    """
    if "," in name:
        return name

    # str.split() with no argument splits on any run of whitespace (spaces,
    # tabs, non-breaking spaces) and never yields empty tokens.
    parts = [tok for tok in name.split() if tok not in _IGNORED_TOKENS]

    # Nothing left after removing titles: hand back the original string.
    if not parts:
        return name
    # A single token cannot be reordered.
    if len(parts) == 1:
        return parts[0]

    if len(parts) == 2:
        first_name, last_name = parts
    elif len(parts) == 3:
        last_name, first_name = _split_three_part_name(parts)
    else:
        last_name, first_name = _split_multi_part_name(parts)
    return f"{last_name}, {first_name}"
def convert_to_lowercase(element):
    """Lower-case a string, or every string nested inside a list.

    :param element: a string, a (possibly nested) list, or any other object
    :return: the lower-cased string; a new list whose items were converted
        recursively; or *element* itself when it is neither a str nor a list
    """
    if isinstance(element, str):
        return element.lower()
    if not isinstance(element, list):
        # Anything that is not a str or list is passed through untouched.
        return element

    lowered = []
    for item in element:
        lowered.append(convert_to_lowercase(item))
    return lowered
def
name_in_niuren_list
(
name
,
niuren_name_list
):
"""
判断名字是否在牛人列表中
...
...
@@ -78,6 +144,8 @@ def name_in_niuren_list(name, niuren_name_list):
:param niuren_name_list: 牛人列表
:return: 如果在牛人列表中,返回 True,否则返回 False
"""
name
=
convert_to_lowercase
(
name
)
niuren_name_list
=
convert_to_lowercase
(
niuren_name_list
)
for
idx
,
niuren_name
in
enumerate
(
niuren_name_list
):
if
isinstance
(
niuren_name
,
str
):
if
name
==
niuren_name
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment