#!/usr/bin/env bash
# Paths and names shared by the training (openrlhf.cli.train_sft) and
# evaluation (codecritic.cli.test_genrm) commands in this script.
#
# Shell options:
#   -e           abort as soon as a command fails
#   -u           fail on expansion of an unset variable, so a misspelled
#                name cannot silently turn into an empty argument
#   -x           trace every command to stderr
#   -o pipefail  a pipeline fails if any of its stages fails
set -euxo pipefail

# Base model and project layout.
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
data="${project}/data"
modelname="qwen25_coder_inst"

# Dataset files; every file name is prefixed with ${modelname}.
trainset="${data}/train/${modelname}-apps-train.jsonl"
testset="${data}/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"

# Distillation file used as the training dataset.
distill="${data}/train/${modelname}-apps-distillation-deepseekv3.jsonl"

# Output locations. The "bs32"/"epoch10" parts of these names mirror the
# batch size and epoch count passed to the training command; keep them in sync.
ftmodel="${project}/model/${modelname}-apps-distillation_bs32_epoch10"
evalresults="${data}/eval/${modelname}-apps-test-distillation-bs32-epoch10.jsonl"

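# Disabled step, kept for reference. It presumably produces ${distill} from
# ${trainset} and ${train_selected_pairs}; uncomment to run it again.
# python -m codecritic.cli.distill \
#       --dataset ${trainset} \
#       --pairinfo ${train_selected_pairs} \
#       --output ${distill}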

# Training step: run the openrlhf.cli.train_sft module through the deepspeed
# launcher, with ${distill} as the dataset (question -> response keys),
# ${model} as the pretrained model and ${ftmodel} as the save path.
#
# The batch size (32) and epoch count (10) below are the values encoded in
# the "bs32_epoch10" suffix of ${ftmodel}; change both together.
#
# All variable expansions are quoted so a path containing whitespace or glob
# characters is passed as exactly one argument.
deepspeed --module \
openrlhf.cli.train_sft \
   --max_len 4096 \
   --dataset "${distill}" \
   --input_key question \
   --output_key response \
   --apply_chat_template \
   --train_batch_size 32 \
   --micro_train_batch_size 1 \
   --max_samples 500000 \
   --pretrain "${model}" \
   --save_path "${ftmodel}" \
   --save_steps -1 \
   --logging_steps 1 \
   --eval_steps -1 \
   --zero_stage 2 \
   --max_epochs 10 \
   --bf16 \
   --flash_attn \
   --learning_rate 5e-6 \
   --gradient_checkpointing \
   --use_tensorboard "${ftmodel}_log"


# Evaluation step: run codecritic.cli.test_genrm on the model saved at
# ${ftmodel} against ${testset}, writing results to ${evalresults}.
# Runs only if the training command above succeeded (the script uses set -e).
#
# Expansions are quoted so each path is passed as exactly one argument.
python -m codecritic.cli.test_genrm \
       --model "${ftmodel}" \
       --testset "${testset}" \
       --output "${evalresults}" \
       --reasoning \
       --tp 1
