Ziyuan Nan / codecritic
Commit e864d804 authored Feb 28, 2025 by nanziyuan
r1

parent ef55d00d
Showing 4 changed files with 25 additions and 12 deletions:

    codecritic/cli/eval_r1.py    +20  -7
    codecritic/cli/test_r1.py     +1  -1
    scripts/config.yaml           +1  -1
    scripts/r1_test.sh            +3  -3
codecritic/cli/eval_r1.py

@@ -5,9 +5,13 @@ from functools import partial
 import codecritic.evaluation.metric as metric
 from codecritic.utils.json import load_jsonl
+import pprint
 
 
 def confidence(item):
     sign = 1 if item["prediction"] else -1
+    if item["confidence"] is None:
+        return -1
     return sign * item["confidence"]
 
 
 def eval(scores):
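Note on the hunk above: before this commit, an item whose "confidence" field was None would raise a TypeError at `sign * item["confidence"]`; the added guard maps such items to the lowest score instead. A minimal behavior sketch, using hypothetical items:

    # Behavior sketch for the patched confidence() above (items are hypothetical).
    def confidence(item):
        sign = 1 if item["prediction"] else -1
        if item["confidence"] is None:
            return -1  # missing/unparseable confidence now scores lowest
        return sign * item["confidence"]

    print(confidence({"prediction": True,  "confidence": 0.9}))   # 0.9
    print(confidence({"prediction": False, "confidence": 0.9}))   # -0.9
    print(confidence({"prediction": True,  "confidence": None}))  # -1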
@@ -37,12 +41,21 @@ if __name__ == "__main__":
     scores = load_jsonl(args.score)
 
     groups = defaultdict(list)
     for item in scores:
-        groups[item["dataset"]].append(item)
+        groups[item["task_id"]].append(item)
 
+    newscores = []
     for dataset, lst in groups.items():
-        results = eval(lst)
-        for r in results:
-            r["dataset"] = dataset
-            r["strategy"] = "r1_qwen_7b"
-            del r["score_func"]
-            print(json.dumps(r))
+        pass_lst = [x["pass"] for x in lst]
+        if any(pass_lst):
+            print(sum(pass_lst))
+        newscores.extend(lst)
+        # results = eval(lst)
+        # for r in results:
+        #     r["dataset"] = dataset
+        #     r["strategy"] = "r1_qwen_7b"
+        #     del r["score_func"]
+        #     print(json.dumps(r))
+
+    print(len(newscores))
+    labels, bscores = [x["pass"] for x in newscores], [1 if x["prediction"] else 0 for x in scores]
+    pprint.pp(metric.binary_metrics(labels, bscores))
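The rewritten loop groups by task_id instead of dataset, prints the passing-sample count for any task with at least one pass, keeps every item in newscores, and reports overall binary metrics; the old per-dataset reporting survives only as comments. metric.binary_metrics is called here but not shown in this commit; the sketch below is a hypothetical stand-in computing standard confusion-matrix metrics over such label/prediction lists, not the repository's actual code:

    # Hypothetical stand-in for codecritic.evaluation.metric.binary_metrics,
    # which this commit calls but does not show.
    def binary_metrics(labels, preds):
        pairs = list(zip(labels, preds))
        tp = sum(1 for l, p in pairs if l and p)
        fp = sum(1 for l, p in pairs if not l and p)
        fn = sum(1 for l, p in pairs if l and not p)
        tn = sum(1 for l, p in pairs if not l and not p)
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        accuracy = (tp + tn) / len(pairs) if pairs else 0.0
        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

One thing to watch in the committed code: labels is built from newscores while bscores is built from scores, and since newscores re-orders items by task group, the two lists may not be index-aligned.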
codecritic/cli/test_r1.py

@@ -118,7 +118,7 @@ if __name__ == "__main__":
     #with ThreadPoolExecutor(max_workers=4) as executor:
     #    responses = executor.map(chat_fun, prompts)
-    responses = thread_map(chat_fun, prompts, max_workers=4)
+    responses = thread_map(chat_fun, prompts, max_workers=8)
 
     for item, response in zip(dataset, responses):
         judgement, confidence = postprocess_response(response)
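The only change in this hunk raises the thread pool from 4 to 8 workers. thread_map is presumably tqdm.contrib.concurrent.thread_map (its import sits outside the hunk); a self-contained sketch of the pattern, with chat_fun as a hypothetical stand-in for the real chat-completion call:

    # Assumes thread_map is tqdm.contrib.concurrent.thread_map.
    from tqdm.contrib.concurrent import thread_map

    def chat_fun(prompt):
        return f"echo: {prompt}"  # placeholder for an API round trip

    prompts = [f"prompt {i}" for i in range(32)]
    # Maps chat_fun over prompts on 8 threads, shows a progress bar, and
    # returns the results in input order.
    responses = thread_map(chat_fun, prompts, max_workers=8)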
scripts/config.yaml

@@ -4,7 +4,7 @@ llm_kit:
     router_port: 8000
     tensor_parallel_size: 1
     pipeline_parallel_size: 1
-    data_parallel_size: 4
+    data_parallel_size: 8
     router_timeout: 1200
     random_seeds:
         - 1111
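This bumps data_parallel_size from 4 to 8. Assuming llmkit follows the common convention where each data-parallel replica occupies tensor_parallel_size x pipeline_parallel_size GPUs, the change doubles the serving replicas and the GPUs required:

    # Hedged arithmetic under the assumed convention above; llmkit's actual
    # semantics are not shown in this commit.
    tensor_parallel_size = 1
    pipeline_parallel_size = 1
    data_parallel_size = 8  # was 4 before this commit

    gpus_per_replica = tensor_parallel_size * pipeline_parallel_size  # 1
    total_gpus = gpus_per_replica * data_parallel_size
    print(total_gpus)  # 8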
scripts/r1_test.sh

@@ -3,11 +3,11 @@ model="/share/collab/codemodel/models/DeepSeek-R1-Distill-Qwen-7B/"
 data="/nfs_global/S/nanziyuan/projects/ccc/data"
 testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
-evalresults="${data}/eval/qwen25_code_inst-apps-test-r1_7b_test.jsonl"
+evalresults="${data}/eval/qwen25_code_inst-apps-test-r1_7b.jsonl"
 
-# python -m llmkit_data.cli.serve --config /nfs_global/S/nanziyuan/projects/ccc/src/scripts/config.yaml &
+python -m llmkit_data.cli.serve --config /nfs_global/S/nanziyuan/projects/ccc/src/scripts/config.yaml &
 # vllm serve ${model} --max_model 12288
-# sleep 300s
+sleep 300s
 
 python -m codecritic.cli.test_r1 \
     --model ${model} \
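The script now starts the llmkit router itself and waits a fixed 300 s before running the test client. A hedged alternative, not part of this commit, is to poll the router port (8000 per scripts/config.yaml) until it accepts connections:

    # Sketch only: replaces the fixed "sleep 300s" with a readiness poll.
    # Host, port, and timeout are assumptions based on scripts/config.yaml.
    import socket
    import time

    def wait_for_port(host="127.0.0.1", port=8000, timeout=1200.0):
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                with socket.create_connection((host, port), timeout=2.0):
                    return True
            except OSError:
                time.sleep(5)
        return False

    if not wait_for_port():
        raise SystemExit("router did not become ready in time")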