Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
codecritic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ziyuan Nan
codecritic
Commits
78b884ce
Commit
78b884ce
authored
Nov 25, 2024
by
nzy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactor: edit_distance
todo: move the functions in utils.data to data
parent
afcf4289
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
49 additions
and
41 deletions
+49
-41
codecritic/cli/mk_rm_dataset.py
+33
-0
codecritic/data/edit_distance.py
+16
-28
codecritic/utils/data.py
+0
-13
No files found.
codecritic/cli/mk_rm_dataset.py
0 → 100644
View file @
78b884ce
import
argparse
from
pathlib
import
Path
from
codecritic.utils.json
import
load_json
from
codecritic.utils.data
import
save_jsonl_dataset
from
codecritic.data.edit_distance
import
(
mk_problem_groups
,
calculate_edit_distances
,
mk_edit_distance_dataset
,
)
def str_to_bool(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` simply calls ``bool()`` on the raw
    argument string, so every non-empty value -- including ``"False"`` and
    ``"0"`` -- is parsed as ``True``. This converter interprets the common
    spellings instead.

    Args:
        value: The raw argument string (an actual ``bool`` is passed through).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, matching what ``bool("")`` returned before.
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def main():
    """Build the edit-distance preference dataset and write it as JSONL.

    Reads ``train.jsonl`` and ``sampling_params.json`` from ``--dataset_dir``,
    groups the samples, computes the edit distances, builds the preference
    pairs and passes them to ``save_jsonl_dataset`` with ``--output_dir``.
    """
    parser = argparse.ArgumentParser()
    # Required: the value is handed straight to ``Path()``, which raises a
    # ``TypeError`` on ``None``; failing in argparse gives a usable message.
    parser.add_argument("--dataset_dir", type=str, required=True)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--is_max", type=str_to_bool, required=True)
    args = parser.parse_args()

    dataset_dir = Path(args.dataset_dir)
    train_path = dataset_dir / "train.jsonl"
    sampling_params = load_json(dataset_dir / "sampling_params.json")

    problems = mk_problem_groups(train_path, sampling_params["n"])
    all_edit_distance_pairs = calculate_edit_distances(problems)

    # NOTE(review): ``dataset_name`` and ``metadata`` are computed but never
    # used below -- confirm whether they were meant to be part of the output.
    postfix = "max" if args.is_max else "min"
    dataset_name = f"apps_edit_distance_{postfix}"
    preference_pairs, metadata = mk_edit_distance_dataset(
        all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
    )
    save_jsonl_dataset(preference_pairs, args.output_dir)


if __name__ == "__main__":
    main()
codecritic/data/edit_distance.py
View file @
78b884ce
import
argparse
from
pathlib
import
Path
from
codecritic.utils.json
import
load_json
,
load_jsonl
from
codecritic.utils.data
import
extract_code
,
mk_preference_pair
,
save_jsonl_dataset
from
codecritic.utils.json
import
load_jsonl
from
codecritic.utils.data
import
extract_code
from
nltk.metrics.distance
import
edit_distance
from
collections
import
defaultdict
from
itertools
import
product
,
chain
import
multiprocessing
from
tqdm.contrib.concurrent
import
process_map
def mk_preference_pair(instruction, chosen_code, rejected_code):
    """Build one preference record for a single prompt.

    Args:
        instruction: Text placed in the single ``user`` message.
        chosen_code: Code rendered into the preferred assistant reply.
        rejected_code: Code rendered into the dispreferred assistant reply.

    Returns:
        A dict with the keys ``messages``, ``chosen`` and ``rejected``; both
        replies are formatted through ``code_template``.
    """
    # ``code_template`` is not provided by any import visible in this module,
    # so calling the function would raise ``NameError``. Import it locally.
    # NOTE(review): assumes ``code_template`` is a module-level name in
    # ``codecritic.utils.data`` -- confirm.
    from codecritic.utils.data import code_template

    def as_assistant(code):
        return {"role": "assistant", "content": code_template.format(code)}

    return {
        "messages": [
            {"role": "user", "content": instruction},
        ],
        "chosen": as_assistant(chosen_code),
        "rejected": as_assistant(rejected_code),
    }
def
mk_problem_groups
(
train_dataset_path
,
n
):
train_dataset
=
load_jsonl
(
train_dataset_path
)
...
...
@@ -86,27 +96,4 @@ def mk_edit_distance_dataset(all_pairs, k, n, is_max=True):
preference_pairs
.
append
(
mk_preference_pair
(
instr
,
pair
[
0
],
pair
[
1
]))
pairs_metadata
.
append
(
dict
(
problem_id
=
problem_id
,
edit_distance
=
distance
))
return
preference_pairs
,
pairs_metadata
def str_to_bool(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` simply calls ``bool()`` on the raw
    argument string, so every non-empty value -- including ``"False"`` and
    ``"0"`` -- is parsed as ``True``. This converter interprets the common
    spellings instead.

    Args:
        value: The raw argument string (an actual ``bool`` is passed through).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, matching what ``bool("")`` returned before.
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def main():
    """Build the edit-distance preference dataset and write it as JSONL.

    Reads ``train.jsonl`` and ``sampling_params.json`` from ``--dataset_dir``,
    groups the samples, computes the edit distances, builds the preference
    pairs and passes them to ``save_jsonl_dataset`` with ``--output_dir``.
    """
    parser = argparse.ArgumentParser()
    # Required: the value is handed straight to ``Path()``, which raises a
    # ``TypeError`` on ``None``; failing in argparse gives a usable message.
    parser.add_argument("--dataset_dir", type=str, required=True)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--is_max", type=str_to_bool, required=True)
    args = parser.parse_args()

    dataset_dir = Path(args.dataset_dir)
    train_path = dataset_dir / "train.jsonl"
    sampling_params = load_json(dataset_dir / "sampling_params.json")

    problems = mk_problem_groups(train_path, sampling_params["n"])
    all_edit_distance_pairs = calculate_edit_distances(problems)

    # NOTE(review): ``dataset_name`` and ``metadata`` are computed but never
    # used below -- confirm whether they were meant to be part of the output.
    postfix = "max" if args.is_max else "min"
    dataset_name = f"apps_edit_distance_{postfix}"
    preference_pairs, metadata = mk_edit_distance_dataset(
        all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
    )
    save_jsonl_dataset(preference_pairs, args.output_dir)


if __name__ == "__main__":
    main()
return
preference_pairs
,
pairs_metadata
\ No newline at end of file
codecritic/utils/data.py
View file @
78b884ce
...
...
@@ -19,19 +19,6 @@ def extract_code(text: str):
return
""
def mk_preference_pair(instruction, chosen_code, rejected_code):
    """Assemble a preference record: one user prompt plus two candidate replies.

    Args:
        instruction: Text used as the content of the ``user`` message.
        chosen_code: Code formatted into the ``chosen`` assistant reply.
        rejected_code: Code formatted into the ``rejected`` assistant reply.

    Returns:
        A dict with ``messages``, ``chosen`` and ``rejected`` entries, in that
        order; each reply's content is ``code_template.format(<code>)``.
    """
    prompt = {"role": "user", "content": instruction}
    record = {"messages": [prompt]}
    # Fill in the two replies in the same order as the original literal.
    for label, code in (("chosen", chosen_code), ("rejected", rejected_code)):
        record[label] = {
            "role": "assistant",
            "content": code_template.format(code),
        }
    return record
# Note that the human and observation should appear in odd positions
# while llm should appear in even positions.
def
mk_messages
(
messages
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment