Commit a89b519a by nanziyuan

refactor: large-scale project structure refactor (WIP)

- Reorganized project structure for better modularity and maintainability.
- Note: This commit is a work in progress and is not yet functional. Further adjustments to imports and code logic are required to make the project runnable.
parent b3bf4ddf
@@ -164,4 +164,4 @@ cython_debug/
 readme.pdf
 *.json
 *.jsonl
-test_*
\ No newline at end of file
+test/
"""Generate YAML training configs for ORM (reward model) and SFT runs."""

import argparse
from pathlib import Path

# Training config template for the reward model (ORM); the {placeholders}
# are filled in by mk_orm_train_config below.
orm_yaml = """\
### model
model_name_or_path: {model_path}
### method
stage: rm
do_train: true
finetuning_type: full
deepspeed: {deepspeed_config_path}
### dataset
dataset: {dataset_name}
template: deepseekcoder
cutoff_len: 4096
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: {output_dir}
logging_steps: 10
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-5
num_train_epochs: 1.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.01
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
"""


def mk_orm_train_config(model_path, dataset_name, output_dir, deepspeed):
    train_str = orm_yaml.format(
        model_path=model_path,
        dataset_name=dataset_name,
        output_dir=output_dir,
        deepspeed_config_path=deepspeed,
    )
    return train_str


# Minimal config pointing at the trained ORM checkpoint; written out as test.yaml.
test_yaml = """\
model_name_or_path: {orm_model_path}
template: deepseekcoder
stage: rm
"""


def mk_orm_test_config(model_path):
    test_str = test_yaml.format(orm_model_path=model_path)
    return test_str


# Training config template for supervised fine-tuning (SFT); the {placeholders}
# are filled in by mk_sft_train_config below.
sft_yaml = """\
### model
model_name_or_path: {model_path}
### method
stage: sft
do_train: true
finetuning_type: full
deepspeed: {deepspeed_config_path}
### dataset
dataset: {dataset_name}
template: deepseekcoder
cutoff_len: 4096
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
mask_history: true
### output
output_dir: {output_dir}
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
"""


def mk_sft_train_config(model_path, dataset_name, output_dir, deepspeed):
    train_str = sft_yaml.format(
        model_path=model_path,
        dataset_name=dataset_name,
        output_dir=output_dir,
        deepspeed_config_path=deepspeed,
    )
    return train_str


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate YAML training configs for ORM or SFT runs."
    )
    parser.add_argument("--model", type=str, required=True)
    parser.add_argument("--dataset", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--deepspeed", type=str, required=True)
    parser.add_argument("--type", type=str, choices=["orm", "sft"], required=True)
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Trained checkpoints are written to <output_dir>/model.
    model_output_dir = (output_dir / "model").absolute().as_posix()

    if args.type == "sft":
        train_yaml = mk_sft_train_config(
            args.model,
            args.dataset,
            model_output_dir,
            args.deepspeed,
        )
        with open(output_dir / "train.yaml", "w") as f:
            f.write(train_yaml)
    elif args.type == "orm":
        train_yaml = mk_orm_train_config(
            args.model,
            args.dataset,
            model_output_dir,
            args.deepspeed,
        )
        with open(output_dir / "train.yaml", "w") as f:
            f.write(train_yaml)
        # The test config points at the checkpoint that training will produce.
        test_yaml = mk_orm_test_config(model_output_dir)
        with open(output_dir / "test.yaml", "w") as f:
            f.write(test_yaml)
    else:
        raise NotImplementedError(f"Unknown training type: {args.type}")
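
For reference, a minimal sketch of using the SFT generator programmatically, assuming the functions above are in scope; the model, dataset, and DeepSpeed paths here are placeholders, not values from this repository:

from pathlib import Path

out = Path("runs/sft_demo")
out.mkdir(parents=True, exist_ok=True)
cfg = mk_sft_train_config(
    "models/deepseek-coder-base",           # placeholder model path
    "my_sft_dataset",                       # placeholder dataset name
    (out / "model").absolute().as_posix(),  # where checkpoints would be written
    "configs/ds_zero2.json",                # placeholder DeepSpeed config
)
(out / "train.yaml").write_text(cfg)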