Unverified Commit 62e23aee by Guangming Sheng Committed by GitHub

[ci] fix: delete ckpts after saving in github runner (#272)

- As titled
parent ec82e9e5
...@@ -46,6 +46,11 @@ jobs:
      run: |
        ray stop --force
        bash tests/e2e/run_qwen_gsm8k_function_rm.sh
- name: Running gsm8k e2e without rmpad using function rm and load ckpt from previous step
run: |
ray stop --force
bash tests/e2e/run_qwen_gsm8k_function_rm_no_rmpad.sh
rm -rf ~/ckpt/*
    - name: Running gsm8k e2e training tests on 8 L20 GPUs with rmpad using function rm (GRPO)
      run: |
        ray stop --force
...@@ -54,10 +59,6 @@ jobs:
      run: |
        ray stop --force
        bash tests/e2e/run_qwen_gsm8k_function_rm_remax.sh
- name: Running gsm8k e2e without rmpad using function rm and load ckpt from previous step
run: |
ray stop --force
bash tests/e2e/run_qwen_gsm8k_function_rm_no_rmpad.sh
    - name: Running gsm8k e2e with rmpad using model rm
      run: |
        ray stop --force
......
...@@ -45,4 +45,5 @@ jobs:
    - name: Running gsm8k e2e training tests with LoRA
      run: |
        ray stop --force
        bash tests/sft/run_sft_qwen05_peft.sh 8 $HOME/ckpts/
        rm -rf $HOME/ckpts/*
\ No newline at end of file
...@@ -58,3 +58,4 @@ jobs:
      run: |
        ray stop --force
        bash tests/sft/run_sft_qwen05_sp2_liger.sh 8 $HOME/ckpts/
rm -rf $HOME/ckpts/
...@@ -39,4 +39,5 @@ python3 -m verl.trainer.main_ppo \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=1 \
    trainer.default_local_dir=$HOME/ckpt/ \
    trainer.total_training_steps=1 $@
...@@ -415,15 +415,15 @@ class ActorRolloutRefWorker(MegatronWorker):
        return output
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def load_checkpoint(self, checkpoint_path, **kwargs):
    """Load actor weights from ``checkpoint_path``.

    No-op stub in this Megatron worker (body is ``pass``). ``**kwargs``
    absorbs extra keyword options so the signature stays call-compatible
    with other worker backends that accept additional checkpoint flags —
    NOTE(review): the exact extra options are defined by the callers;
    confirm against the non-Megatron workers.
    """
    pass
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def load_pretrained_model(self, checkpoint_path, **kwargs):
    """Load pretrained model weights from ``checkpoint_path``.

    No-op stub in this Megatron worker (body is ``pass``). ``**kwargs``
    keeps the signature compatible with callers that pass extra keyword
    options — NOTE(review): options are ignored here; confirm callers
    do not rely on them for Megatron.
    """
    pass
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def save_checkpoint(self, checkpoint_path, **kwargs):
    """Save actor weights to ``checkpoint_path``.

    Only valid on an actor worker (guarded by ``assert self._is_actor``);
    otherwise a no-op stub. ``**kwargs`` absorbs extra keyword options
    for signature compatibility with other worker backends.
    NOTE(review): ``assert`` is stripped under ``python -O`` — if the
    actor-only guard matters in production, raise an exception instead.
    """
    assert self._is_actor
    pass
...@@ -590,11 +590,11 @@ class CriticWorker(MegatronWorker):
        return output
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def load_checkpoint(self, checkpoint_path, **kwargs):
    """Load critic weights from ``checkpoint_path``.

    No-op stub in this Megatron critic worker (body is ``pass``).
    ``**kwargs`` keeps the signature call-compatible with backends that
    accept additional checkpoint options; they are ignored here.
    """
    pass
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def save_checkpoint(self, checkpoint_path, **kwargs):
    """Save critic weights to ``checkpoint_path``.

    No-op stub in this Megatron critic worker (body is ``pass``).
    ``**kwargs`` keeps the signature call-compatible with backends that
    accept additional checkpoint options; they are ignored here.
    """
    pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment