Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
codecritic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ziyuan Nan
codecritic
Commits
9381291e
Commit
9381291e
authored
Nov 05, 2024
by
nzy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactor: fix import & remove unused code
parent
a89b519a
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
45 additions
and
82 deletions
+45
-82
codecritic/cli/run_rm_test.py
+3
-2
codecritic/cli/run_sft_test.py
+5
-5
codecritic/cli/sample_apps.py
+6
-5
codecritic/data/cov.py
+3
-2
codecritic/data/edit_distance.py
+2
-2
codecritic/data/pair_to_instr.py
+2
-2
codecritic/sampling/evaluate_code.py
+1
-1
codecritic/sampling/sample_apps.py
+0
-1
codecritic/sampling/sort_split_dataset.py
+0
-5
codecritic/utils/data.py
+22
-39
codecritic/utils/json.py
+0
-17
codecritic/utils/vllm.py
+1
-1
No files found.
codecritic/cli/run_rm_test.py
View file @
9381291e
...
...
@@ -7,8 +7,9 @@ from transformers import AutoTokenizer
import
pprint
from
pathlib
import
Path
from
utils
import
load_jsonl
,
save_jsonl
,
extract_code
,
code_template
from
utils_metric
import
group_results
,
score_pass_at_k
from
codecritic.utils.json
import
load_jsonl
,
save_jsonl
from
codecritic.utils.data
import
extract_code
,
code_template
from
codecritic.utils.metric
import
group_results
,
score_pass_at_k
def
get_rewards_from_server
(
server_url
:
str
,
messages
:
list
[
str
]):
...
...
codecritic/cli/run_sft_test.py
View file @
9381291e
...
...
@@ -2,11 +2,11 @@ import argparse
from
pathlib
import
Path
import
pprint
from
step2_cov_dataset
import
COV_PROMPT
from
utils_
vllm
import
vllm_chatcomplete
,
vllm_score
from
utils
import
load_jsonl
,
save_jsonl
,
extract_code
,
code_template
from
utils_dataset
import
mk_critic_qa
,
JUDGE_PROMPT
,
get_score_token_id
from
utils_
metric
import
group_results
,
score_pass_at_k
from
codecritic.data.cov
import
COV_PROMPT
from
codecritic.utils.
vllm
import
vllm_chatcomplete
,
vllm_score
from
codecritic.utils.json
import
load_jsonl
,
save_jsonl
from
codecritic.utils.data
import
extract_code
,
code_template
,
mk_critic_qa
,
JUDGE_PROMPT
,
get_score_token_id
from
codecritic.utils.
metric
import
group_results
,
score_pass_at_k
def
preprocess_test_item
(
item
):
...
...
codecritic/cli/sample_apps.py
View file @
9381291e
import
argparse
from
pathlib
import
Path
from
utils
import
save_json
,
save_jsonl
from
utils_vllm
import
vllm_chatcomplete
from
step1_sample_apps
import
mk_sample_prompt
from
step1_evaluate_code
import
evaluate
from
step1_sort_split_dataset
import
sort_and_split_dataset
from
codecritic.utils.json
import
save_json
,
save_jsonl
from
codecritic.utils.vllm
import
vllm_chatcomplete
from
codecritic.sampling.sample_apps
import
mk_sample_prompt
from
codecritic.sampling.evaluate_code
import
evaluate
from
codecritic.sampling.sort_split_dataset
import
sort_and_split_dataset
if
__name__
==
"__main__"
:
...
...
codecritic/data/cov.py
View file @
9381291e
...
...
@@ -3,8 +3,9 @@
import
argparse
from
itertools
import
chain
from
utils
import
load_json
,
extract_code
,
code_template
from
utils_dataset
import
mk_critic_qa
,
mk_sft_item
,
mk_critic_verify
,
mk_sft_dataset_info
,
save_dataset
,
SPLITTER
from
utils_vllm
import
vllm_chatcomplete
from
codecritic.utils.data
import
mk_critic_qa
,
mk_sft_item
,
mk_critic_verify
,
mk_sft_dataset_info
,
save_dataset
,
SPLITTER
from
codecritic.utils.vllm
import
vllm_chatcomplete
COV_PROMPT
=
"Please verify your code step by step using Markdown code blocks. After each step, explain whether it's correct or not, and if not, explain the issue."
...
...
codecritic/data/edit_distance.py
View file @
9381291e
import
argparse
from
pathlib
import
Path
from
utils
import
load_json
,
load_jsonl
,
save_json
,
save_jsonl
,
extract_code
from
utils_dataset
import
mk_preference_dataset_info
,
mk_preference_pair
,
save_dataset
from
codecritic.utils.json
import
load_json
,
load_jsonl
,
save_json
from
codecritic.utils.data
import
extract_code
,
mk_preference_dataset_info
,
mk_preference_pair
,
save_dataset
from
nltk.metrics.distance
import
edit_distance
from
collections
import
defaultdict
from
itertools
import
product
,
chain
...
...
codecritic/data/pair_to_instr.py
View file @
9381291e
...
...
@@ -4,8 +4,8 @@
# 2. Using SFT (Supervised Fine-Tuning) directly
# This experiment aims to fairly compare these two approaches.
import
argparse
from
utils
import
load_json
from
utils_dataset
import
mk_critic_qa
,
mk_critic_verify
,
mk_sft_item
,
mk_sft_dataset_info
,
save_dataset
from
codecritic.utils.json
import
load_json
from
codecritic.utils.data
import
mk_critic_qa
,
mk_critic_verify
,
mk_sft_item
,
mk_sft_dataset_info
,
save_dataset
def
convert_preference_to_sft
(
item
):
...
...
codecritic/sampling/evaluate_code.py
View file @
9381291e
...
...
@@ -7,7 +7,7 @@ import numpy as np
from
datasets
import
load_dataset
from
tqdm.contrib.concurrent
import
process_map
from
step1_
apps_test
import
run_test
from
codecritic.sampling.
apps_test
import
run_test
from
utils
import
extract_code
,
load_jsonl
,
save_jsonl
TIMEOUT
=
10
...
...
codecritic/sampling/sample_apps.py
View file @
9381291e
from
datasets
import
load_dataset
import
json
from
utils
import
save_jsonl
from
transformers
import
AutoTokenizer
...
...
codecritic/sampling/sort_split_dataset.py
View file @
9381291e
from
utils
import
load_jsonl
,
save_jsonl
def
mk_key_for_sort
(
item
):
problem_id
=
item
[
'problem_id'
]
prefix
,
idx
=
problem_id
.
split
(
'_'
)
...
...
@@ -53,5 +50,3 @@ def sort_and_split_dataset(dataset, n):
minimal_test
.
extend
(
problem
)
return
new_train
,
new_test
,
minimal_test
codecritic/utils/data.py
View file @
9381291e
from
utils
import
load_json
,
save_json
,
code_template
import
re
from
codecritic.utils.json
import
load_json
,
save_json
from
transformers
import
AutoTokenizer
codeblock_pattern
=
re
.
compile
(
r"```python(.+?)```"
,
flags
=
re
.
DOTALL
)
code_template
=
"""```python
{}
```
"""
def
mk_preference_dataset_info
(
dataset_name
):
return
{
dataset_name
:
{
"file_name"
:
f
"{dataset_name}.json"
,
"formatting"
:
"sharegpt"
,
"ranking"
:
True
,
"columns"
:
{
"messages"
:
"messages"
,
"chosen"
:
"chosen"
,
"rejected"
:
"rejected"
,
},
"tags"
:
{
"role_tag"
:
"role"
,
"content_tag"
:
"content"
,
"user_tag"
:
"user"
,
"assistant_tag"
:
"assistant"
,
"system_tag"
:
"system"
,
},
}
}
def
extract_code
(
text
:
str
):
codes
=
[
match
.
strip
()
for
match
in
re
.
findall
(
codeblock_pattern
,
text
)]
if
len
(
codes
)
>
0
:
code
=
"
\n
"
.
join
(
codes
)
return
code
else
:
return
""
def
mk_preference_pair
(
instruction
,
chosen_code
,
rejected_code
):
...
...
@@ -37,23 +31,6 @@ def mk_preference_pair(instruction, chosen_code, rejected_code):
}
def
mk_sft_dataset_info
(
dataset_name
):
return
{
dataset_name
:
{
"file_name"
:
f
"{dataset_name}.json"
,
"formatting"
:
"sharegpt"
,
"columns"
:
{
"messages"
:
"messages"
},
"tags"
:
{
"role_tag"
:
"role"
,
"content_tag"
:
"content"
,
"user_tag"
:
"user"
,
"assistant_tag"
:
"assistant"
,
"system_tag"
:
"system"
,
},
}
}
# Note that the human and observation should appear in odd positions
# while llm should appear in even positions.
def
mk_sft_item
(
messages
):
...
...
@@ -69,6 +46,8 @@ def mk_critic_qa(instruction, code):
JUDGE_PROMPT
=
"Is the code correct (Yes/No)?"
def
mk_critic_verify
(
answer
=
None
):
# answer: bool or none
message
=
[{
"role"
:
"user"
,
"content"
:
JUDGE_PROMPT
}]
...
...
@@ -99,7 +78,10 @@ def get_score_token_id(model_path, token_str="Yes"):
def
mk_critic_reason
(
codedit
,
explanation
):
user_question
=
{
"role"
:
"user"
,
"content"
:
"Edit your code in diff format to fix any issues and explain the changes."
}
user_question
=
{
"role"
:
"user"
,
"content"
:
"Edit your code in diff format to fix any issues and explain the changes."
,
}
llm_answer_content
=
f
"""
\
**Edited Code (in diff format):**
```diff
...
...
@@ -112,4 +94,5 @@ def mk_critic_reason(codedit, explanation):
llm_answer
=
{
"role"
:
"assistant"
,
"content"
:
llm_answer_content
}
return
[
user_question
,
llm_answer
]
SPLITTER
=
"__I_wish_it_were_weekends_all_the_time.__"
codecritic/utils/json.py
View file @
9381291e
import
json
import
re
def
load_jsonl
(
file_path
):
...
...
@@ -21,19 +20,3 @@ def save_jsonl(data, file_path):
def
save_json
(
data
,
file_path
,
indent
=
None
):
with
open
(
file_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
json
.
dump
(
data
,
f
,
indent
=
indent
)
codeblock_pattern
=
re
.
compile
(
r"```python(.+?)```"
,
flags
=
re
.
DOTALL
)
code_template
=
"""```python
{}
```
"""
def
extract_code
(
text
:
str
):
codes
=
[
match
.
strip
()
for
match
in
re
.
findall
(
codeblock_pattern
,
text
)]
if
len
(
codes
)
>
0
:
code
=
"
\n
"
.
join
(
codes
)
return
code
else
:
return
""
codecritic/utils/vllm.py
View file @
9381291e
...
...
@@ -5,7 +5,7 @@ import multiprocessing
from
itertools
import
chain
from
functools
import
partial
from
utils_dataset
import
SPLITTER
from
codecritic.utils.data
import
SPLITTER
import
numpy
as
np
def
generate_worker
(
cuda_device
,
prompts
,
model_path
,
sampling_params
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment