Commit e864d804 by nanziyuan

r1

parent ef55d00d
@@ -5,9 +5,13 @@ from functools import partial
 import codecritic.evaluation.metric as metric
 from codecritic.utils.json import load_jsonl
+import pprint
 
 
 def confidence(item):
     sign = 1 if item["prediction"] else -1
+    if item["confidence"] is None:
+        return -1
     return sign * item["confidence"]
 
 
 def eval(scores):
@@ -37,12 +41,21 @@ if __name__ == "__main__":
     scores = load_jsonl(args.score)
     groups = defaultdict(list)
     for item in scores:
-        groups[item["dataset"]].append(item)
+        groups[item["task_id"]].append(item)
 
+    newscores = []
     for dataset, lst in groups.items():
-        results = eval(lst)
-        for r in results:
-            r["dataset"] = dataset
-            r["strategy"] = "r1_qwen_7b"
-            del r["score_func"]
-            print(json.dumps(r))
+        pass_lst = [x["pass"] for x in lst]
+        if any(pass_lst):
+            print(sum(pass_lst))
+            newscores.extend(lst)
+        # results = eval(lst)
+        # for r in results:
+        #     r["dataset"] = dataset
+        #     r["strategy"] = "r1_qwen_7b"
+        #     del r["score_func"]
+        #     print(json.dumps(r))
+
+    print(len(newscores))
+    # both lists come from the filtered newscores so labels and predictions align
+    labels, bscores = [x["pass"] for x in newscores], [1 if x["prediction"] else 0 for x in newscores]
+    pprint.pp(metric.binary_metrics(labels, bscores))
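Note: the new evaluation path feeds the filtered labels and predictions into metric.binary_metrics, which is not part of this diff. A minimal sketch of what it plausibly computes, assuming it wraps standard sklearn binary-classification scores (function body and dict keys are guesses, not the repo's actual implementation):

    # Hypothetical stand-in for codecritic.evaluation.metric.binary_metrics;
    # assumes it returns a dict of standard binary-classification scores.
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    def binary_metrics(labels, preds):
        return {
            "accuracy": accuracy_score(labels, preds),
            "precision": precision_score(labels, preds),
            "recall": recall_score(labels, preds),
            "f1": f1_score(labels, preds),
        }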
@@ -118,7 +118,7 @@ if __name__ == "__main__":
     #with ThreadPoolExecutor(max_workers=4) as executor:
     #    responses = executor.map(chat_fun, prompts)
-    responses = thread_map(chat_fun, prompts, max_workers=4)
+    responses = thread_map(chat_fun, prompts, max_workers=8)
     for item, response in zip(dataset, responses):
         judgement, confidence = postprocess_response(response)
...
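Note on the concurrency change: thread_map here is presumably tqdm.contrib.concurrent.thread_map, a drop-in replacement for the commented-out ThreadPoolExecutor.map that adds a progress bar and preserves input order. A minimal usage sketch under that assumption (chat_fun below is a stand-in, not the real request function):

    from tqdm.contrib.concurrent import thread_map

    def chat_fun(prompt):
        return prompt.upper()  # placeholder for the actual model call

    prompts = ["a", "b", "c"]
    responses = thread_map(chat_fun, prompts, max_workers=8)  # results ordered like prompts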
@@ -4,7 +4,7 @@ llm_kit:
   router_port: 8000
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
-  data_parallel_size: 4
+  data_parallel_size: 8
   router_timeout: 1200
 random_seeds:
   - 1111
...
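Note: with tensor_parallel_size and pipeline_parallel_size both 1, data_parallel_size presumably sets the number of independent model replicas, so the bump from 4 to 8 doubles the GPU footprint. A quick sanity check of the implied budget, assuming one GPU per tensor-by-pipeline shard replicated per data-parallel rank:

    # Assumed GPU budget: tp * pp GPUs per replica, times data-parallel replicas.
    tensor_parallel_size = 1
    pipeline_parallel_size = 1
    data_parallel_size = 8
    print(tensor_parallel_size * pipeline_parallel_size * data_parallel_size)  # 8 GPUs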
@@ -3,11 +3,11 @@ model="/share/collab/codemodel/models/DeepSeek-R1-Distill-Qwen-7B/"
 data="/nfs_global/S/nanziyuan/projects/ccc/data"
 testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
-evalresults="${data}/eval/qwen25_code_inst-apps-test-r1_7b_test.jsonl"
+evalresults="${data}/eval/qwen25_code_inst-apps-test-r1_7b.jsonl"
 
-# python -m llmkit_data.cli.serve --config /nfs_global/S/nanziyuan/projects/ccc/src/scripts/config.yaml &
+python -m llmkit_data.cli.serve --config /nfs_global/S/nanziyuan/projects/ccc/src/scripts/config.yaml &
 # vllm serve ${model} --max_model 12288
-# sleep 300s
+sleep 300s
 
 python -m codecritic.cli.test_r1 \
     --model ${model} \
...
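Note: the script now launches the llmkit router in the background and waits a fixed 300 s before running the evaluation. If the fixed sleep ever proves flaky, one alternative is to poll the server until it responds; a sketch, assuming the router serves HTTP on router_port 8000 (the endpoint path below is a guess, not a documented llmkit route):

    import time
    import urllib.request

    def wait_for_server(url="http://localhost:8000/v1/models", timeout=600):
        # Poll until the router answers, instead of sleeping a fixed 300s.
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                urllib.request.urlopen(url, timeout=5)
                return
            except OSError:
                time.sleep(5)
        raise TimeoutError(f"server at {url} not ready after {timeout}s")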