Commit 83cce64a by Yaoyu Zhu

add data preprocess script

parent a8d29994
# Invocations of examples/data_preprocess/codev.py for the CodeV datasets.
#
# Every variant below passes --data_path (a .jsonl file), --local_dir (a
# directory under data/codev/v1/), --train_size and --test_size; some also
# pass --continuous_reward together with --error_ratio_threshold.
# NOTE(review): the exact meaning of each flag is defined by codev.py, which
# is not shown here -- confirm there before changing values.
#
# Only one variant is uncommented; the others are left in place, presumably so
# they can be switched on by hand when a different dataset is needed.

# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/decontamination_sft_model_filter_0320_correct_synthesizable_r1_system_prompt_filter_codev_dataset_165k_v3.jsonl --local_dir data/codev/v1/3.1k_r1_filtered --train_size 3030 --test_size 100
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/decontamination_sft_model_filter_0320_correct_synthesizable_r1_system_prompt_filter_codev_dataset_165k_v3.jsonl --local_dir data/codev/v1/continuous_reward_3.1k_r1_filtered --continuous_reward --error_ratio_threshold 0.2 --train_size 3030 --test_size 100
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/sft_model_filter_0320_correct_synthesizable_r1_system_prompt_filter_codev_dataset_165k_v3.jsonl --local_dir data/codev/v1/3.4k_r1_filtered --train_size 3300 --test_size 100
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/0320_correct_synthesizable_r1_system_prompt_filter_codev_dataset_165k_v3.jsonl --local_dir data/codev/v1/4.8k_r1 --train_size 4770 --test_size 100

# ---- active variant: 16k_r1_filtered ----
python examples/data_preprocess/codev.py \
  --data_path '/nfs_global/S/lvhanqi/codev_data/decontamination_sft_model_filter_4.8k_and_qwen_32b_correct1234_system_prompt_codev_dataset_v3.jsonl' \
  --local_dir 'data/codev/v1/16k_r1_filtered' \
  --train_size 15480 \
  --test_size 500

# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/decontamination_sft_model_filter_4.8k_and_qwen_32b_correct1234_system_prompt_codev_dataset_v3.jsonl --local_dir data/codev/v1/continuous_reward_16k_0.8_r1_filtered --continuous_reward --error_ratio_threshold 0.2 --train_size 15480 --test_size 500
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/decontamination_sft_model_filter_4.8k_and_qwen_32b_correct1234_system_prompt_codev_dataset_v3.jsonl --local_dir data/codev/v1/continuous_reward_16k_0.0_r1_filtered --continuous_reward --error_ratio_threshold 1.0 --train_size 15480 --test_size 500
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/zhangxiaoyun/CodeV-o1/data/sft/codev_o1_qwq_output.jsonl --local_dir data/codev/v1/1.6k --train_size 1600 --test_size 100
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/zhangxiaoyun/CodeV-o1/data/sft/codev_o1_qwq_output.jsonl --local_dir data/codev/v1/continuous_reward_1.6k_0.8 --continuous_reward --error_ratio_threshold 0.2 --train_size 1600 --test_size 100
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/zhangxiaoyun/CodeV-o1/data/sft/codev_o1_qwq_10k.jsonl --local_dir data/codev/v1/10k_qwq --train_size 9700 --test_size 500
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/error_rate_l_0.2_from_87k_and_decontamination_qwen_32b_correct_1234.jsonl --local_dir data/codev/v1/continuous_reward_32k_0.8_r1_qwen --continuous_reward --error_ratio_threshold 0.2 --train_size 32000 --test_size 500
# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/error_rate_l_0.2_from_87k.jsonl --local_dir data/codev/v1/continuous_reward_20k_0.8_r1 --continuous_reward --error_ratio_threshold 0.2 --train_size 20000 --test_size 500
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment