[ci] fix: delete ckpts after saving in github runner (#272)

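- As titled: the e2e CI workflows now run `rm -rf` on the checkpoint directory once the steps that save or load checkpoints have finished. The Megatron workers' checkpoint methods also accept extra keyword arguments (`**kwargs`).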

[ci] fix: delete ckpts after saving in github runner (#272)
- As titled
62e23aee · Guangming Sheng · GitHub · ec82e9e5 · 62e23aee · 62e23aee
Unverified commit 62e23aee, authored Feb 14, 2025 by Guangming Sheng; committed by GitHub on Feb 14, 2025.
Showing 5 changed files with 15 additions and 11 deletions

.github/workflows/e2e_gsm8k.yml
+5 -4

.github/workflows/e2e_lora.yml
+3 -2

.github/workflows/e2e_sft.yml
+1 -0

tests/e2e/run_qwen_gsm8k_function_rm.sh
+1 -0

verl/workers/megatron_workers.py
+5 -5

No files found.
--- a/.github/workflows/e2e_gsm8k.yml
+++ b/.github/workflows/e2e_gsm8k.yml
@@ -46,6 +46,11 @@ jobs:
        run: |
          ray stop --force
          bash tests/e2e/run_qwen_gsm8k_function_rm.sh
+      - name: Running gsm8k e2e without rmpad using function rm and load ckpt from previous step
+        run: |
+          ray stop --force
+          bash tests/e2e/run_qwen_gsm8k_function_rm_no_rmpad.sh
+          rm -rf ~/ckpt/*
      - name: Running gsm8k e2e training tests on 8 L20 GPUs with rmpad using function rm (GRPO)
        run: |
          ray stop --force
@@ -54,10 +59,6 @@ jobs:
        run: |
          ray stop --force
          bash tests/e2e/run_qwen_gsm8k_function_rm_remax.sh
-      - name: Running gsm8k e2e without rmpad using function rm and load ckpt from previous step
-        run: |
-          ray stop --force
-          bash tests/e2e/run_qwen_gsm8k_function_rm_no_rmpad.sh
      - name: Running gsm8k e2e with rmpad using model rm
        run: |
          ray stop --force

--- a/.github/workflows/e2e_lora.yml
+++ b/.github/workflows/e2e_lora.yml
@@ -45,4 +45,5 @@ jobs:
      - name: Running gsm8k e2e training tests with LoRA
        run: |
          ray stop --force
-          bash tests/sft/run_sft_qwen05_peft.sh 8 $HOME/ckpts/
\ No newline at end of file
+          bash tests/sft/run_sft_qwen05_peft.sh 8 $HOME/ckpts/
+          rm -rf $HOME/ckpts/*
\ No newline at end of file
--- a/.github/workflows/e2e_sft.yml
+++ b/.github/workflows/e2e_sft.yml
@@ -58,3 +58,4 @@ jobs:
        run: |
          ray stop --force
          bash tests/sft/run_sft_qwen05_sp2_liger.sh 8 $HOME/ckpts/
+          rm -rf $HOME/ckpts/
--- a/tests/e2e/run_qwen_gsm8k_function_rm.sh
+++ b/tests/e2e/run_qwen_gsm8k_function_rm.sh
@@ -39,4 +39,5 @@ python3 -m verl.trainer.main_ppo \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=1 \
+    trainer.default_local_dir=$HOME/ckpt/ \
    trainer.total_training_steps=1 $@
--- a/verl/workers/megatron_workers.py
+++ b/verl/workers/megatron_workers.py
@@ -415,15 +415,15 @@ class ActorRolloutRefWorker(MegatronWorker):
        return output

    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def load_checkpoint(self, checkpoint_path):
+    def load_checkpoint(self, checkpoint_path, **kwargs):
        pass

    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def load_pretrained_model(self, checkpoint_path):
+    def load_pretrained_model(self, checkpoint_path, **kwargs):
        pass

    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def save_checkpoint(self, checkpoint_path):
+    def save_checkpoint(self, checkpoint_path, **kwargs):
        assert self._is_actor
        pass

@@ -590,11 +590,11 @@ class CriticWorker(MegatronWorker):
        return output

    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def load_checkpoint(self, checkpoint_path):
+    def load_checkpoint(self, checkpoint_path, **kwargs):
        pass

    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def save_checkpoint(self, checkpoint_path):
+    def save_checkpoint(self, checkpoint_path, **kwargs):
        pass