Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
papertools
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ziyuan Nan
papertools
Commits
2693f2e7
Commit
2693f2e7
authored
May 09, 2025
by
matianyun
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload New File
parent
6da5da74
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
301 additions
and
0 deletions
+301
-0
papertools_niuren_ccfa/niurenpipei_update.py
+301
-0
No files found.
papertools_niuren_ccfa/niurenpipei_update.py
0 → 100644
View file @
2693f2e7
import
os
import
sys
# Run from this script's own directory so the relative 'info/...' and
# 'output/...' paths used below resolve no matter where the script is launched.
# abspath() guards against a bare relative __file__, whose dirname is '' and
# would make os.chdir('') raise.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
import
json
import
pandas
as
pd
from
tqdm
import
tqdm
import
openpyxl
from
copy
import
copy
# from joblib import Parallel, delayed
from
utils
import
standardized_name
,
name_in_niuren_list
# Workbook that is analysed; it is also the source of the "全局牛人" and
# "全局非牛人" sheets read by the script's main section.
input_file_path = 'info/测试大表2.xlsx'
# input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'

# Destination workbook; its parent directory is created on import if missing.
output_file_path = 'output/论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'
output_dir = os.path.dirname(output_file_path)
if output_dir:
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

# CSV read by load_niuren_pool (its "name" column is used).
niuren_pool_path = 'info/niuren_pool.csv'
# Directories holding one "<position>.xlsx" paper list per listed person.
true_niuren_papers_path = 'info/true_niuren_papers'
fake_niuren_papers_path = 'info/fake_niuren_papers'
# 全局变量,存储加载的数据
# NIUREN_POOL_NAMES = []
# TRUE_NIUREN_NAMES = []
# TRUE_NIUREN_PAPERS = []
# FAKE_NIUREN_NAMES = []
# FAKE_NIUREN_PAPERS = []
def load_niuren_pool(niuren_pool_path):
    """Read the candidate pool CSV and return the values of its "name" column.

    The file is read as UTF-8 with an optional BOM. Every non-breaking space
    (U+00A0) in a name is replaced by an ordinary space; list order follows
    the CSV row order.
    """
    pool_frame = pd.read_csv(niuren_pool_path, encoding='utf-8-sig')
    cleaned_names = []
    for raw_name in pool_frame["name"].tolist():
        # Turn non-breaking spaces into regular spaces.
        cleaned_names.append(raw_name.replace("\xa0", " "))
    return cleaned_names
def load_true_niuren(true_niuren, true_niuren_papers_path):
    """Build the confirmed-list names and each person's known paper titles.

    Args:
        true_niuren: DataFrame with a "姓名" column and the long alias column
            (aliases separated by ";"). A parsed "别名列表" column is added
            to this frame in place.
        true_niuren_papers_path: Directory containing one workbook per person
            named "<1-based position>.xlsx"; titles are read from its first
            column. A missing workbook yields an empty title list.

    Returns:
        A tuple ``(names, papers)`` of equal length. ``names[i]`` is the
        standardized name as a string, or a list ``[name, *aliases]`` when the
        row has aliases. ``papers[i]`` is a list of stripped, lower-cased
        titles.
    """
    alias_source_col = "别名列表(各种奇奇怪怪的名字格式,比如first name和second name的顺序,以;分隔)"

    def parse_aliases(cell):
        # Non-string cells (e.g. NaN for an empty cell) carry no aliases.
        if not isinstance(cell, str):
            return []
        # Strip before testing for emptiness so "a; ;b" or a trailing "; "
        # does not produce a blank alias.
        return [
            standardized_name(part.strip())
            for part in cell.split(";")
            if part.strip()
        ]

    true_niuren["别名列表"] = true_niuren[alias_source_col].apply(parse_aliases)

    # Build the name list.
    true_niuren_names = []
    for _, row in true_niuren.iterrows():
        if pd.isna(row["姓名"]):
            # Stop at the first blank name; rows after it are ignored.
            break
        primary_name = standardized_name(row["姓名"])
        aliases = row["别名列表"]
        if aliases:
            true_niuren_names.append([primary_name] + aliases)
        else:
            true_niuren_names.append(primary_name)

    # Build the per-person paper list; file names are 1-based positions.
    true_niuren_papers = []
    for position in range(1, len(true_niuren_names) + 1):
        paper_file_path = os.path.join(true_niuren_papers_path, f"{position}.xlsx")
        if not os.path.exists(paper_file_path):
            true_niuren_papers.append([])
            continue
        papers_df = pd.read_excel(paper_file_path, usecols=[0])
        titles = papers_df.iloc[:, 0].dropna().tolist()
        # check_niuren looks titles up as title.strip().lower(), so store them
        # in the same form; str() tolerates numeric cells.
        true_niuren_papers.append([str(title).strip().lower() for title in titles])

    return true_niuren_names, true_niuren_papers
def load_fake_niuren(fake_niuren, fake_niuren_papers_path):
    """Build the excluded-list names and each person's known paper titles.

    Args:
        fake_niuren: DataFrame with a "姓名" column.
        fake_niuren_papers_path: Directory containing one workbook per person
            named "<1-based position>.xlsx"; titles are read from its first
            column. A missing workbook yields an empty title list.

    Returns:
        A tuple ``(names, papers)`` of equal length: standardized names and,
        for each, a list of stripped, lower-cased titles.
    """
    # Build the name list.
    fake_niuren_names = []
    for _, row in fake_niuren.iterrows():
        if pd.isna(row["姓名"]):
            # Stop at the first blank name; rows after it are ignored.
            break
        fake_niuren_names.append(standardized_name(row["姓名"]))

    # Build the per-person paper list; file names are 1-based positions.
    fake_niuren_papers = []
    for position in range(1, len(fake_niuren_names) + 1):
        paper_file_path = os.path.join(fake_niuren_papers_path, f"{position}.xlsx")
        if not os.path.exists(paper_file_path):
            fake_niuren_papers.append([])
            continue
        papers_df = pd.read_excel(paper_file_path, usecols=[0])
        titles = papers_df.iloc[:, 0].dropna().tolist()
        # check_niuren looks titles up as title.strip().lower(), so store them
        # in the same form; str() tolerates numeric cells.
        fake_niuren_papers.append([str(title).strip().lower() for title in titles])

    return fake_niuren_names, fake_niuren_papers
def check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers,
                 fake_niuren_names, fake_niuren_papers):
    """Classify the authors of one citing paper.

    For each non-empty author, in order:
      1. Skip the author if the name is not found in ``niuren_pool_names``.
      2. If the name is found in ``true_niuren_names`` and the title is in
         that person's paper list, record the author as confirmed together
         with the 1-based author position.
      3. Otherwise, if the name is found in ``fake_niuren_names`` and the
         title is in that person's paper list, skip the author.
      4. Otherwise record the author as suspected, with the 1-based pool index.

    ``name_in_niuren_list`` results of -1 are treated as "not found".

    Args:
        authors: Sequence of author name strings; empty strings are skipped.
        title: Title of the citing paper. A non-string value (e.g. NaN from an
            empty cell) matches no paper list.
        niuren_pool_names, true_niuren_names, fake_niuren_names: Name lists
            passed to ``name_in_niuren_list``.
        true_niuren_papers, fake_niuren_papers: Per-person title lists indexed
            like the corresponding name list.

    Returns:
        ``(niuren_str, order_str, suspected_niuren_str)``: ";"-joined confirmed
        authors, their ";"-joined 1-based positions, and ";"-joined
        ``"name(pool_index)"`` entries for suspected authors.
    """
    # Normalise once; previously a non-string title raised AttributeError on
    # .strip(), which made the caller blank out the entire row.
    normalized_title = title.strip().lower() if isinstance(title, str) else None

    niuren = []
    order = []
    suspected_niuren = []
    for author_idx, author in enumerate(authors):
        if author == "":
            continue
        # Standardize once per author instead of once per lookup.
        std_name = standardized_name(author)

        # Is the author in the candidate pool at all?
        name_index = name_in_niuren_list(std_name, niuren_pool_names)
        if name_index == -1:
            continue

        # Confirmed list: the name and this very paper must both match.
        true_name_index = name_in_niuren_list(std_name, true_niuren_names)
        if true_name_index != -1 and normalized_title in true_niuren_papers[true_name_index]:
            niuren.append(author)
            order.append(author_idx + 1)
            continue

        # Excluded list: the name and this very paper must both match.
        fake_name_index = name_in_niuren_list(std_name, fake_niuren_names)
        if fake_name_index != -1 and normalized_title in fake_niuren_papers[fake_name_index]:
            continue

        suspected_niuren.append([author, name_index + 1])

    niuren_str = ";".join(niuren)
    order_str = ";".join(str(position) for position in order)
    suspected_niuren_str = ";".join(f"{name}({index})" for name, index in suspected_niuren)
    return niuren_str, order_str, suspected_niuren_str
def process_row(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers,
                fake_niuren_names, fake_niuren_papers):
    """Run check_niuren for one data row and tag the result with its index.

    Args:
        index: Row label, returned unchanged so results can be matched to rows.
        row: Mapping-like row (supports ``.get``) read for '引文作者'
            (";"-separated authors) and '引文名称' (title).
        niuren_pool_names, true_niuren_names, true_niuren_papers,
        fake_niuren_names, fake_niuren_papers: Forwarded to ``check_niuren``.

    Returns:
        ``(index, niuren, niuren_order, suspected_niuren)``. The three strings
        are empty when the author cell is not a string or when processing
        raises; in the latter case the error is printed.
    """
    try:
        authors = row.get('引文作者', '')
        title = row.get('引文名称', '')
        if not isinstance(authors, str):
            return index, "", "", ""
        # Strip first, then drop blanks: an entry such as "A; ;B" must not keep
        # an empty author that would shift the 1-based author positions.
        stripped_parts = (part.strip() for part in authors.split(";"))
        authors = [name for name in stripped_parts if name]
        niuren_true, niuren_true_order, suspected_niuren = check_niuren(
            authors, title,
            niuren_pool_names, true_niuren_names, true_niuren_papers,
            fake_niuren_names, fake_niuren_papers,
        )
        return index, niuren_true, niuren_true_order, suspected_niuren
    except Exception as e:
        # Best effort: one bad row is reported and left blank rather than
        # aborting the whole run.
        print(f"处理行 {index} 时发生错误: {e}")
        return index, "", "", ""
def _copy_workbook_with_styles(source_wb):
    """Return a new workbook with every sheet's values, cell styles and merges copied."""
    target_wb = openpyxl.Workbook()
    # Remove the blank sheet that Workbook() creates by default.
    if 'Sheet' in target_wb.sheetnames:
        del target_wb['Sheet']

    for sheet_name in source_wb.sheetnames:
        source_ws = source_wb[sheet_name]
        target_ws = target_wb.create_sheet(sheet_name)

        # Sheet-level properties.
        target_ws.sheet_properties = copy(source_ws.sheet_properties)
        target_ws.sheet_format = copy(source_ws.sheet_format)

        # Cell values and styles.
        for source_row in source_ws.rows:
            for cell in source_row:
                new_cell = target_ws.cell(row=cell.row, column=cell.column, value=cell.value)
                if cell.has_style:
                    new_cell.font = copy(cell.font)
                    new_cell.border = copy(cell.border)
                    new_cell.fill = copy(cell.fill)
                    new_cell.number_format = copy(cell.number_format)
                    new_cell.protection = copy(cell.protection)
                    new_cell.alignment = copy(cell.alignment)

        # Merged ranges.
        for merged_cell_range in source_ws.merged_cells.ranges:
            target_ws.merge_cells(str(merged_cell_range))

    return target_wb


if __name__ == "__main__":
    # Layout of the main sheet, as 1-based Excel row numbers. The same numbers
    # drive both the pandas read and the openpyxl write-back so that results
    # land on the rows they were computed from.
    header_row_number = 4
    first_data_row_number = 8

    print("正在读取全局牛人...")
    true_niuren = pd.read_excel(input_file_path, sheet_name="全局牛人")
    print("正在读取全局非牛人...")
    fake_niuren = pd.read_excel(input_file_path, sheet_name="全局非牛人")

    print("正在处理牛人池...")
    niuren_pool_names = load_niuren_pool(niuren_pool_path)
    print("正在处理真牛人数据...")
    true_niuren_names, true_niuren_papers = load_true_niuren(true_niuren, true_niuren_papers_path)
    print("正在处理非牛人数据...")
    fake_niuren_names, fake_niuren_papers = load_fake_niuren(fake_niuren, fake_niuren_papers_path)

    # Column names come from the header row of the first sheet.
    original_header = pd.read_excel(input_file_path, nrows=0, header=header_row_number - 1)
    column_names = original_header.columns.tolist()

    # Data records start at first_data_row_number; everything above is skipped.
    input_df = pd.read_excel(
        input_file_path,
        skiprows=first_data_row_number - 1,
        header=None,
        names=column_names,
    )

    # Basic information about the header and the data.
    print("表头元素:")
    print(column_names)
    print("\n数据行数:", input_df.shape[0])
    print("数据列数:", input_df.shape[1])

    niuren_col = '牛人\n(参考全局牛人列表)'
    niuren_order_col = '牛人署名顺序\n'
    suspected_col = '疑似牛人'

    # Make sure the three result columns exist in the frame.
    for result_col in (suspected_col, niuren_col, niuren_order_col):
        if result_col not in input_df.columns:
            input_df[result_col] = None

    print("开始并行处理数据...")
    # Rows are processed sequentially, in frame order.
    results = []
    for index, row in tqdm(input_df.iterrows(), total=input_df.shape[0]):
        results.append(
            process_row(
                index, row,
                niuren_pool_names, true_niuren_names, true_niuren_papers,
                fake_niuren_names, fake_niuren_papers,
            )
        )

    # results holds one tuple per row in frame order, so whole columns can be
    # assigned at once; this also avoids writing strings cell by cell into
    # columns that pandas inferred as numeric.
    input_df[niuren_col] = [result[1] for result in results]
    input_df[niuren_order_col] = [result[2] for result in results]
    input_df[suspected_col] = [result[3] for result in results]

    # Copy the original workbook so values, styles and merges carry over.
    print("正在读取原始Excel文件以保留格式...")
    wb_original = openpyxl.load_workbook(input_file_path)
    wb_new = _copy_workbook_with_styles(wb_original)

    # The main sheet is the first sheet of the workbook.
    main_sheet_name = wb_original.sheetnames[0]
    ws_new = wb_new[main_sheet_name]

    print("正在更新主工作表中的牛人数据...")

    # Locate the result columns in the header row (Excel columns are 1-based).
    # If a header occurs more than once, the last occurrence is used.
    target_headers = [niuren_col, niuren_order_col, suspected_col]
    column_index = {}
    for i, header_cell in enumerate(ws_new[header_row_number], start=1):
        if header_cell.value in target_headers:
            column_index[header_cell.value] = i

    # Append any result column that the sheet does not have yet.
    next_free_column = ws_new.max_column + 1
    for header in target_headers:
        if header not in column_index:
            ws_new.cell(row=header_row_number, column=next_free_column, value=header)
            column_index[header] = next_free_column
            next_free_column += 1

    # Write back only the three result columns, starting at the first data row.
    for excel_row, (_, row) in enumerate(input_df.iterrows(), start=first_data_row_number):
        ws_new.cell(row=excel_row, column=column_index[niuren_col], value=row[niuren_col])
        ws_new.cell(row=excel_row, column=column_index[niuren_order_col], value=row[niuren_order_col])
        ws_new.cell(row=excel_row, column=column_index[suspected_col], value=row[suspected_col])

    # Save the new workbook.
    print(f"正在保存为Excel格式 {output_file_path} ...")
    wb_new.save(output_file_path)
    print(f"成功保存到 {output_file_path},保留了原始格式")

    print("处理完成!")
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment