diff --git a/pyproject.toml b/pyproject.toml index 8f7f6b2de..8c87448f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,14 @@ fms-accel = ["fms-acceleration>=0.6.2"] gptq-dev = ["auto_gptq>0.4.2", "optimum>=1.15.0"] mamba = ["mamba_ssm[causal-conv1d]>=2.0.0,<3.0.0"] scanner-dev = ["HFResourceScanner>=0.1.0"] +fms-accel-all = [ + "fms-acceleration>=0.6.2", + "fms-acceleration-peft", + "fms-acceleration-foak", + "fms-acceleration-aadp", + "fms-acceleration-moe", + "fms-acceleration-odm" +] [tool.setuptools.packages.find] exclude = ["tests", "tests.*"] diff --git a/tests/acceleration/test_acceleration_framework.py b/tests/acceleration/test_acceleration_framework.py index 757d9fa00..add77c730 100644 --- a/tests/acceleration/test_acceleration_framework.py +++ b/tests/acceleration/test_acceleration_framework.py @@ -263,10 +263,10 @@ def test_framework_raises_if_used_with_missing_package(): ValueError, match="No acceleration framework package found." ): sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, - TRAIN_ARGS, - PEFT_LORA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + copy.deepcopy(TRAIN_ARGS), + copy.deepcopy(PEFT_LORA_ARGS), quantized_lora_config=quantized_lora_config, ) @@ -320,9 +320,9 @@ def test_framework_raises_due_to_invalid_arguments( with pytest.raises(exception, match=exception_msg): sft_trainer.train( model_args, - DATA_ARGS, + copy.deepcopy(DATA_ARGS), train_args, - peft_config, + copy.deepcopy(peft_config), quantized_lora_config=quantized_lora_config, ) @@ -379,7 +379,7 @@ def test_framework_initialized_properly_peft( train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir train_args.save_strategy = "no" - train_args.fp16 = True + train_args.bf16 = True peft_args = copy.deepcopy(PEFT_LORA_ARGS) peft_args.target_modules = ["q_proj", "k_proj"] @@ -395,7 +395,7 @@ def test_framework_initialized_properly_peft( with instantiate_model_patcher(): sft_trainer.train( model_args, - DATA_ARGS, + copy.deepcopy(DATA_ARGS), 
train_args, peft_args, quantized_lora_config=quantized_lora_config, @@ -430,7 +430,7 @@ def test_framework_initialized_properly_foak(): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir train_args.save_strategy = "no" - train_args.fp16 = True + train_args.bf16 = True peft_args = copy.deepcopy(PEFT_LORA_ARGS) peft_args.target_modules = ["q_proj", "k_proj"] @@ -465,7 +465,7 @@ def test_framework_initialized_properly_foak(): with instantiate_model_patcher(): sft_trainer.train( model_args, - DATA_ARGS, + copy.deepcopy(DATA_ARGS), train_args, peft_args, quantized_lora_config=quantized_lora_config, @@ -613,8 +613,8 @@ def test_error_raised_with_paddingfree_and_flash_attn_disabled(): model_args.use_flash_attn = False sft_trainer.train( model_args, - DATA_ARGS, - TRAIN_ARGS, + copy.deepcopy(DATA_ARGS), + copy.deepcopy(TRAIN_ARGS), attention_and_distributed_packing_config=attention_and_distributed_packing_config, ) @@ -637,8 +637,8 @@ def test_error_raised_with_multipack_and_paddingfree_disabled(): model_args = copy.deepcopy(MODEL_ARGS) sft_trainer.train( model_args, - DATA_ARGS, - TRAIN_ARGS, + copy.deepcopy(DATA_ARGS), + copy.deepcopy(TRAIN_ARGS), attention_and_distributed_packing_config=attention_and_distributed_packing_config, ) @@ -664,7 +664,7 @@ def test_error_raised_with_packing_and_paddingfree_enabled(): train_args.packing = True sft_trainer.train( model_args, - DATA_ARGS, + copy.deepcopy(DATA_ARGS), train_args, attention_and_distributed_packing_config=attention_and_distributed_packing_config, ) @@ -693,7 +693,6 @@ def test_error_raised_with_fused_lora_enabled_without_quantized_argument(): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir train_args.save_strategy = "no" - train_args.fp16 = True peft_args = copy.deepcopy(PEFT_LORA_ARGS) peft_args.target_modules = ["q_proj", "k_proj"] @@ -713,7 +712,7 @@ def test_error_raised_with_fused_lora_enabled_without_quantized_argument(): with instantiate_model_patcher(): 
sft_trainer.train( model_args, - DATA_ARGS, + copy.deepcopy(DATA_ARGS), train_args, peft_args, quantized_lora_config=None, diff --git a/tests/artifacts/language_models/__init__.py b/tests/artifacts/language_models/__init__.py index 891456402..7be49dc1b 100644 --- a/tests/artifacts/language_models/__init__.py +++ b/tests/artifacts/language_models/__init__.py @@ -20,3 +20,4 @@ ### Constants used for model path PREDEFINED_MODEL_PATH = os.path.join(os.path.dirname(__file__)) MAYKEYE_TINY_LLAMA_CACHED = os.path.join(PREDEFINED_MODEL_PATH, "maykeye-tinyllama-v0") +TINYMIXTRAL_MOE = "Isotonic/TinyMixtral-4x248M-MoE" diff --git a/tests/build/test_launch_script.py b/tests/build/test_launch_script.py index c3d5b9b6a..322fe5998 100644 --- a/tests/build/test_launch_script.py +++ b/tests/build/test_launch_script.py @@ -81,25 +81,19 @@ } -def setup_env(tempdir): - os.environ["TRAINING_SCRIPT"] = SCRIPT - os.environ["PYTHONPATH"] = "./:$PYTHONPATH" - os.environ["TERMINATION_LOG_FILE"] = tempdir + "/termination-log" +def setup_env(monkeypatch, tempdir): + monkeypatch.setenv("TRAINING_SCRIPT", SCRIPT) + monkeypatch.setenv("PYTHONPATH", "./:$PYTHONPATH") + monkeypatch.setenv("TERMINATION_LOG_FILE", os.path.join(tempdir, "termination-log")) -def cleanup_env(): - os.environ.pop("TRAINING_SCRIPT", None) - os.environ.pop("PYTHONPATH", None) - os.environ.pop("TERMINATION_LOG_FILE", None) - - -def test_successful_ft(): +def test_successful_ft(monkeypatch): """Check if we can bootstrap and fine tune causallm models""" with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = {**BASE_KWARGS, **{"output_dir": tempdir}} serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) assert main() == 0 _validate_termination_files_when_tuning_succeeds(tempdir) @@ -108,13 +102,13 @@ def test_successful_ft(): 
@pytest.mark.skipif(True, reason="This test is deprecated so always skipped") -def test_successful_pt(): +def test_successful_pt(monkeypatch): """Check if we can bootstrap and peft tune causallm models""" with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = {**BASE_PEFT_KWARGS, **{"output_dir": tempdir}} serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) assert main() == 0 _validate_termination_files_when_tuning_succeeds(tempdir) @@ -122,13 +116,13 @@ def test_successful_pt(): _validate_training_output(checkpoint, "pt") -def test_successful_lora(): +def test_successful_lora(monkeypatch): """Check if we can bootstrap and LoRA tune causallm models""" with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = {**BASE_LORA_KWARGS, **{"output_dir": tempdir}} serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) assert main() == 0 _validate_termination_files_when_tuning_succeeds(tempdir) @@ -136,7 +130,7 @@ def test_successful_lora(): _validate_training_output(checkpoint, "lora") -def test_lora_save_model_dir_separate_dirs(): +def test_lora_save_model_dir_separate_dirs(monkeypatch): """Run LoRA tuning with separate save_model_dir and output_dir. Verify model saved to save_model_dir and checkpoints saved to output_dir. 
@@ -144,7 +138,7 @@ def test_lora_save_model_dir_separate_dirs(): with tempfile.TemporaryDirectory() as tempdir: output_dir = os.path.join(tempdir, "output_dir") save_model_dir = os.path.join(tempdir, "save_model_dir") - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = { **BASE_LORA_KWARGS, **{ @@ -154,7 +148,7 @@ def test_lora_save_model_dir_separate_dirs(): }, } serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) assert main() == 0 _validate_termination_files_when_tuning_succeeds(output_dir) @@ -165,12 +159,12 @@ def test_lora_save_model_dir_separate_dirs(): assert len(checkpoints) == 1 -def test_lora_save_model_dir_same_dir_as_output_dir(): +def test_lora_save_model_dir_same_dir_as_output_dir(monkeypatch): """Run LoRA tuning with same save_model_dir and output_dir. Verify checkpoints, logs, and model saved to path. """ with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = { **BASE_LORA_KWARGS, **{ @@ -180,7 +174,7 @@ def test_lora_save_model_dir_same_dir_as_output_dir(): }, } serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) assert main() == 0 # check logs, checkpoint dir, and model exists in path @@ -195,19 +189,21 @@ def test_lora_save_model_dir_same_dir_as_output_dir(): assert len(checkpoints) == TRAIN_KWARGS["num_train_epochs"] -def test_lora_save_model_dir_same_dir_as_output_dir_save_strategy_no(): +def test_lora_save_model_dir_same_dir_as_output_dir_save_strategy_no( + monkeypatch, +): """Run LoRA tuning with same save_model_dir and output_dir and save_strategy=no. Verify no checkpoints created, only logs and final model. 
""" with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = { **BASE_LORA_KWARGS, **{"output_dir": tempdir, "save_model_dir": tempdir, "save_strategy": "no"}, } serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) assert main() == 0 # check that model and logs exists in output_dir @@ -219,9 +215,9 @@ def test_lora_save_model_dir_same_dir_as_output_dir_save_strategy_no(): assert len(checkpoints) == 0 -def test_lora_with_lora_post_process_for_vllm_set_to_true(): +def test_lora_with_lora_post_process_for_vllm_set_to_true(monkeypatch): with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = { **BASE_LORA_KWARGS, **{ @@ -231,7 +227,7 @@ def test_lora_with_lora_post_process_for_vllm_set_to_true(): }, } serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) assert main() == 0 # check that model and logs exists in output_dir @@ -255,9 +251,9 @@ def test_lora_with_lora_post_process_for_vllm_set_to_true(): not _is_package_available("HFResourceScanner"), reason="Only runs if HFResourceScanner is installed", ) -def test_launch_with_HFResourceScanner_enabled(): +def test_launch_with_HFResourceScanner_enabled(monkeypatch): with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) scanner_outfile = os.path.join(tempdir, TrackerConfigs.scanner_output_filename) TRAIN_KWARGS = { **BASE_LORA_KWARGS, @@ -271,7 +267,7 @@ def test_launch_with_HFResourceScanner_enabled(): }, } serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) assert 
main() == 0 assert os.path.exists(scanner_outfile) is True @@ -281,14 +277,14 @@ def test_launch_with_HFResourceScanner_enabled(): assert scanner_res["mem_data"] is not None -def test_bad_script_path(): +def test_bad_script_path(monkeypatch): """Check for appropriate error for an invalid training script location""" with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = {**BASE_LORA_KWARGS, **{"output_dir": tempdir}} serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args - os.environ["TRAINING_SCRIPT"] = "/not/here" + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) + monkeypatch.setenv("TRAINING_SCRIPT", "/not/here") with pytest.raises(SystemExit) as pytest_wrapped_e: main() @@ -297,10 +293,10 @@ def test_bad_script_path(): assert os.stat(tempdir + "/termination-log").st_size > 0 -def test_blank_env_var(): +def test_blank_env_var(monkeypatch): with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = "" + setup_env(monkeypatch, tempdir) + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", "") with pytest.raises(SystemExit) as pytest_wrapped_e: main() assert pytest_wrapped_e.type == SystemExit @@ -308,16 +304,16 @@ def test_blank_env_var(): assert os.stat(tempdir + "/termination-log").st_size > 0 -def test_faulty_file_path(): +def test_faulty_file_path(monkeypatch): with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) faulty_path = os.path.join(tempdir, "non_existent_file.pkl") TRAIN_KWARGS = { **BASE_LORA_KWARGS, **{"training_data_path": faulty_path, "output_dir": tempdir}, } serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) with pytest.raises(SystemExit) as pytest_wrapped_e: main() 
assert pytest_wrapped_e.type == SystemExit @@ -325,16 +321,16 @@ def test_faulty_file_path(): assert os.stat(tempdir + "/termination-log").st_size > 0 -def test_bad_base_model_path(): +def test_bad_base_model_path(monkeypatch): with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = { **BASE_LORA_KWARGS, **{"model_name_or_path": "/wrong/path"}, "output_dir": tempdir, } serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) with pytest.raises(SystemExit) as pytest_wrapped_e: main() assert pytest_wrapped_e.type == SystemExit @@ -342,16 +338,16 @@ def test_bad_base_model_path(): assert os.stat(tempdir + "/termination-log").st_size > 0 -def test_config_parsing_error(): +def test_config_parsing_error(monkeypatch): with tempfile.TemporaryDirectory() as tempdir: - setup_env(tempdir) + setup_env(monkeypatch, tempdir) TRAIN_KWARGS = { **BASE_LORA_KWARGS, **{"num_train_epochs": "five"}, "output_dir": tempdir, } # Intentional type error serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) with pytest.raises(SystemExit) as pytest_wrapped_e: main() assert pytest_wrapped_e.type == SystemExit @@ -376,9 +372,3 @@ def _validate_training_output(base_dir, tuning_technique): else: assert os.path.exists(base_dir + "/adapter_config.json") is True assert os.path.exists(base_dir + "/adapter_model.safetensors") is True - - -def test_cleanup(): - # This runs to unset env variables that could disrupt other tests - cleanup_env() - assert True diff --git a/tests/build/test_utils.py b/tests/build/test_utils.py index 4ad228879..abf372239 100644 --- a/tests/build/test_utils.py +++ b/tests/build/test_utils.py @@ -55,9 +55,15 @@ def 
test_process_accelerate_launch_args(job_config): @patch("torch.cuda.device_count", return_value=1) -def test_accelerate_launch_args_user_set_num_processes_ignored(job_config): +def test_accelerate_launch_args_user_set_num_processes_ignored( + _mock_cuda_count, job_config, monkeypatch +): job_config_copy = copy.deepcopy(job_config) job_config_copy["accelerate_launch_args"]["num_processes"] = "3" + if "CUDA_VISIBLE_DEVICES" in os.environ: + monkeypatch.setenv("CUDA_VISIBLE_DEVICES", os.environ["CUDA_VISIBLE_DEVICES"]) + else: + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) args = process_accelerate_launch_args(job_config_copy) # determine number of processes by number of GPUs available assert args.num_processes == 1 @@ -66,8 +72,8 @@ def test_accelerate_launch_args_user_set_num_processes_ignored(job_config): assert os.getenv("CUDA_VISIBLE_DEVICES") == "0" -@patch.dict(os.environ, {"SET_NUM_PROCESSES_TO_NUM_GPUS": "False"}) -def test_accelerate_launch_args_user_set_num_processes(job_config): +def test_accelerate_launch_args_user_set_num_processes(job_config, monkeypatch): + monkeypatch.setenv("SET_NUM_PROCESSES_TO_NUM_GPUS", "False") job_config_copy = copy.deepcopy(job_config) job_config_copy["accelerate_launch_args"]["num_processes"] = "3" @@ -94,7 +100,10 @@ def test_accelerate_launch_args_default_fsdp_config_multigpu(job_config): @patch("os.path.exists") -def test_process_accelerate_launch_custom_config_file(patch_path_exists): +def test_process_accelerate_launch_custom_config_file(patch_path_exists, monkeypatch): + monkeypatch.setenv("SET_NUM_PROCESSES_TO_NUM_GPUS", "False") + monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "") + patch_path_exists.return_value = True dummy_config_path = "dummy_fsdp_config.yaml" diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index 670913574..7f3d402c1 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -39,7 +39,7 @@ # First Party from build.utils import serialize_args from 
scripts.run_inference import TunedCausalLM -from tests.artifacts.language_models import MAYKEYE_TINY_LLAMA_CACHED +from tests.artifacts.language_models import MAYKEYE_TINY_LLAMA_CACHED, TINYMIXTRAL_MOE from tests.artifacts.predefined_data_configs import ( CHAT_TEMPLATE_JINJA, DATA_CONFIG_DUPLICATE_COLUMNS, @@ -174,7 +174,9 @@ def test_resume_training_from_checkpoint(enable_reduce_loss_sum): train_args.enable_reduce_loss_sum = enable_reduce_loss_sum train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args, None + ) _validate_training(tempdir) # Get trainer state of latest checkpoint @@ -183,7 +185,9 @@ def test_resume_training_from_checkpoint(enable_reduce_loss_sum): # Resume training with higher epoch and same output dir train_args.num_train_epochs += 5 - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args, None + ) _validate_training(tempdir) # Get trainer state of latest checkpoint @@ -217,7 +221,9 @@ def test_resume_training_from_checkpoint_with_flag_true(): train_args.output_dir = tempdir train_args.resume_from_checkpoint = "True" - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args, None + ) _validate_training(tempdir) # Get trainer state of latest checkpoint @@ -229,7 +235,9 @@ def test_resume_training_from_checkpoint_with_flag_true(): # Resume training with higher epoch and same output dir train_args.num_train_epochs += 5 - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args, None + ) _validate_training(tempdir) # Get trainer state of latest checkpoint @@ -256,7 +264,9 @@ def test_resume_training_from_checkpoint_with_flag_false(): train_args.output_dir = 
tempdir train_args.resume_from_checkpoint = "False" - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args, None + ) _validate_training(tempdir) # Get trainer state of latest checkpoint @@ -269,7 +279,9 @@ def test_resume_training_from_checkpoint_with_flag_false(): # Training again with higher epoch and same output dir train_args.num_train_epochs += 5 - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args, None + ) _validate_training(tempdir) # Get Training log entry for epoch 1 @@ -286,7 +298,9 @@ def test_resume_training_from_checkpoint_with_flag_checkpoint_path_lora(): lora_config = copy.deepcopy(PEFT_LORA_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, lora_config) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args, lora_config + ) _validate_training(tempdir) # Get trainer state and checkpoint_path of second last checkpoint @@ -298,7 +312,9 @@ def test_resume_training_from_checkpoint_with_flag_checkpoint_path_lora(): # Resume training with higher epoch and same output dir train_args.num_train_epochs += 5 train_args.resume_from_checkpoint = checkpoint_path - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, lora_config) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args, lora_config + ) _validate_training(tempdir) # Get total_flos from trainer state of checkpoint_path and check if its same @@ -378,7 +394,12 @@ def test_run_train_fails_training_data_path_not_exist(): updated_data_path_args = copy.deepcopy(DATA_ARGS) updated_data_path_args.training_data_path = "fake/path" with pytest.raises(DatasetNotFoundError): - sft_trainer.train(MODEL_ARGS, updated_data_path_args, TRAIN_ARGS, None) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + updated_data_path_args, + 
copy.deepcopy(TRAIN_ARGS), + None, + ) HAPPY_PATH_DUMMY_CONFIG_PATH = os.path.join( @@ -497,7 +518,12 @@ def test_run_causallm_pt_and_inference(): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, PEFT_PT_ARGS) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + copy.deepcopy(PEFT_PT_ARGS), + ) # validate peft tuning configs _validate_training(tempdir) @@ -534,7 +560,12 @@ def test_run_causallm_pt_and_inference_with_formatting_data(): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, data_args, train_args, PEFT_PT_ARGS) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_PT_ARGS), + ) # validate peft tuning configs _validate_training(tempdir) @@ -568,7 +599,12 @@ def test_run_causallm_pt_and_inference_JSON_file_formatter(): "### Text: {{element['Tweet text']}} \n\n### Label: {{text_label}}" ) - sft_trainer.train(MODEL_ARGS, data_args, train_args, PEFT_PT_ARGS) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_PT_ARGS), + ) # validate peft tuning configs _validate_training(tempdir) @@ -602,7 +638,12 @@ def test_run_causallm_pt_init_text(): num_virtual_tokens=0, ) - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, tuning_config) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tuning_config, + ) # validate peft tuning configs _validate_training(tempdir) @@ -637,7 +678,10 @@ def test_run_causallm_lora_invalid_train_params(param_name, param_val, exc_msg): with pytest.raises(ValueError, match=exc_msg): sft_trainer.train( - MODEL_ARGS, DATA_ARGS, invalid_params, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + invalid_params, + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -655,7 +699,10 @@ def 
test_run_causallm_lora_with_validation(dataset_path): data_args.validation_data_path = dataset_path sft_trainer.train( - MODEL_ARGS, data_args, train_args, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) _validate_training(tempdir, check_eval=True) @@ -678,7 +725,10 @@ def test_run_causallm_lora_with_validation_data_formatting(dataset_path): ) sft_trainer.train( - MODEL_ARGS, data_args, train_args, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) _validate_training(tempdir, check_eval=True) @@ -699,7 +749,9 @@ def test_run_causallm_pt_with_custom_tokenizer(dataset_path): data_args = copy.deepcopy(DATA_ARGS) data_args.validation_data_path = dataset_path with pytest.raises(ValueError): - sft_trainer.train(model_args, data_args, train_args, PEFT_PT_ARGS) + sft_trainer.train( + model_args, data_args, train_args, copy.deepcopy(PEFT_PT_ARGS) + ) ############################# Lora Tests ############################# @@ -731,7 +783,12 @@ def test_run_causallm_lora_and_inference(request, target_modules, expected): if "default" not in request._pyfuncitem.callspec.id: base_lora_args.target_modules = target_modules - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, base_lora_args) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + base_lora_args, + ) # validate lora tuning configs _validate_training(tempdir) @@ -775,7 +832,12 @@ def test_run_causallm_alora_and_inference(request, target_modules, expected): if "default" not in request._pyfuncitem.callspec.id: base_alora_args.target_modules = target_modules - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, base_alora_args) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + base_alora_args, + ) # validate lora tuning configs _validate_training(tempdir) @@ -801,7 +863,7 @@ def 
test_run_causallm_alora_and_inference(request, target_modules, expected): assert "Simply put, the theory of relativity states that \n" in output_inference -def test_successful_lora_target_modules_default_from_main(): +def test_successful_lora_target_modules_default_from_main(monkeypatch): """Check that if target_modules is not set, or set to None via JSON, the default value by model type will be using in LoRA tuning. The correct default target modules will be used for model type llama @@ -818,7 +880,7 @@ def test_successful_lora_target_modules_default_from_main(): **{"peft_method": "lora", "output_dir": tempdir}, } serialized_args = serialize_args(TRAIN_KWARGS) - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = serialized_args + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", serialized_args) sft_trainer.main() @@ -859,7 +921,9 @@ def test_run_causallm_lora_add_special_tokens(): "<|test_token_3|>", ] - sft_trainer.train(MODEL_ARGS, data_args, train_args, base_lora_args) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), data_args, train_args, base_lora_args + ) # validate lora tuning configs _validate_training(tempdir) @@ -909,7 +973,12 @@ def test_run_causallm_lora_tied_weights_in_modules_to_save(modules_to_save, expe base_lora_args.target_modules = ["q_proj"] base_lora_args.modules_to_save = modules_to_save - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, base_lora_args) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + base_lora_args, + ) # validate lora tuning configs _validate_training(tempdir) @@ -953,7 +1022,12 @@ def test_run_causallm_lora_tied_weights_in_target_modules(target_modules, expect base_lora_args = copy.deepcopy(PEFT_LORA_ARGS) base_lora_args.target_modules = target_modules - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, base_lora_args) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + base_lora_args, + ) # validate lora tuning configs 
_validate_training(tempdir) @@ -1016,7 +1090,9 @@ def test_run_causallm_ft_save_with_save_model_dir_save_strategy_no(): save_model_args.save_strategy = "no" save_model_args.output_dir = tempdir - trainer, _, _ = sft_trainer.train(MODEL_ARGS, DATA_ARGS, save_model_args, None) + trainer, _, _ = sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), save_model_args, None + ) logs_path = os.path.join(tempdir, TrackerConfigs.training_logs_filename) _validate_logfile(logs_path) # validate that no checkpoints created @@ -1054,7 +1130,7 @@ def test_run_causallm_ft_pretokenized(dataset_path, packing): train_args.packing = packing train_args.max_seq_length = 256 - sft_trainer.train(MODEL_ARGS, data_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_args, train_args) # validate full ft configs _validate_training(tempdir) @@ -1121,7 +1197,7 @@ def test_run_causallm_ft_and_inference_streaming(datasetconfigname, datafiles): train_args.output_dir = tempdir train_args.max_steps = 1 - sft_trainer.train(MODEL_ARGS, data_formatting_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_formatting_args, train_args) # validate full ft configs _validate_training(tempdir) @@ -1212,7 +1288,7 @@ def test_run_causallm_ft_and_inference_with_multiple_dataset( train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, data_formatting_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_formatting_args, train_args) # validate full ft configs _validate_training(tempdir) @@ -1258,7 +1334,7 @@ def test_run_training_with_pretokenised_dataset_containing_input_ids(): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, data_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_args, train_args) # validate full ft configs _validate_training(tempdir) @@ -1304,7 +1380,7 @@ def 
test_run_training_with_data_tokenized_using_tokenizer_handler(): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, data_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_args, train_args) # validate full ft configs _validate_training(tempdir) @@ -1349,7 +1425,7 @@ def test_run_training_with_skip_large_column_handler(): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, data_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_args, train_args) # validate full ft configs _validate_training(tempdir) @@ -1482,7 +1558,7 @@ def test_run_chat_style_add_special_tokens_ft(): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, data_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_args, train_args) # validate the configs _validate_training(tempdir) @@ -1732,7 +1808,7 @@ def test_run_e2e_with_hf_dataset_id(data_args): train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir - sft_trainer.train(MODEL_ARGS, data_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_args, train_args) # validate ft tuning configs _validate_training(tempdir) @@ -1759,18 +1835,14 @@ def test_run_moe_ft_and_inference_ep1_kernels(dataset_path, ep_degree): data_args = copy.deepcopy(DATA_ARGS) data_args.training_data_path = dataset_path model_args = copy.deepcopy(MODEL_ARGS) - model_args.model_name_or_path = "Isotonic/TinyMixtral-4x248M-MoE" + model_args.model_name_or_path = TINYMIXTRAL_MOE train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=ep_degree)) sft_trainer.train( model_args, data_args, train_args, fast_moe_config=fast_moe_config ) - _test_run_inference( - checkpoint_path=os.path.join( - _get_checkpoint_path(tempdir), "hf_converted_checkpoint" - ) - ) + 
_test_run_inference(checkpoint_path=_get_hf_converted_path(tempdir)) @pytest.mark.skipif( @@ -1795,7 +1867,7 @@ def test_run_moe_lora_and_inference(dataset_path, target_modules, ep_degree): data_args = copy.deepcopy(DATA_ARGS) data_args.training_data_path = dataset_path model_args = copy.deepcopy(MODEL_ARGS) - model_args.model_name_or_path = "ibm-granite/granite-3.1-1b-a400m-base" + model_args.model_name_or_path = TINYMIXTRAL_MOE train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir lora_args = copy.deepcopy(PEFT_LORA_ARGS) @@ -1803,29 +1875,17 @@ def test_run_moe_lora_and_inference(dataset_path, target_modules, ep_degree): lora_args.target_modules = target_modules fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=ep_degree)) - if target_modules == "all-linear": - with pytest.raises(ValueError): - sft_trainer.train( - model_args, - data_args, - train_args, - lora_args, - fast_moe_config=fast_moe_config, - ) - else: - sft_trainer.train( - model_args, - data_args, - train_args, - lora_args, - fast_moe_config=fast_moe_config, - ) - _test_run_inference( - checkpoint_path=os.path.join( - _get_checkpoint_path(tempdir), "hf_converted_checkpoint" - ), - base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base", - ) + sft_trainer.train( + model_args, + data_args, + train_args, + lora_args, + fast_moe_config=fast_moe_config, + ) + _test_run_inference( + checkpoint_path=_get_checkpoint_path(tempdir), + base_model_name_or_path=TINYMIXTRAL_MOE, + ) @pytest.mark.skipif( @@ -1845,7 +1905,7 @@ def test_run_moe_ft_with_save_model_dir(dataset_path): data_args = copy.deepcopy(DATA_ARGS) data_args.training_data_path = dataset_path model_args = copy.deepcopy(MODEL_ARGS) - model_args.model_name_or_path = "Isotonic/TinyMixtral-4x248M-MoE" + model_args.model_name_or_path = TINYMIXTRAL_MOE train_args = copy.deepcopy(TRAIN_ARGS) train_args.output_dir = tempdir train_args.save_model_dir = save_model_dir @@ -1853,12 +1913,14 @@ def 
test_run_moe_ft_with_save_model_dir(dataset_path): sft_trainer.train( model_args, data_args, train_args, fast_moe_config=fast_moe_config ) - assert os.path.exists(os.path.join(save_model_dir, "hf_converted_checkpoint")) + assert os.path.exists(os.path.join(save_model_dir)) ############################# Helper functions ############################# def _test_run_causallm_ft(training_args, model_args, data_args, tempdir): train_args = copy.deepcopy(training_args) + model_args = copy.deepcopy(model_args) + data_args = copy.deepcopy(data_args) train_args.output_dir = tempdir sft_trainer.train(model_args, data_args, train_args, None) @@ -1927,6 +1989,27 @@ def _get_checkpoint_path(dir_path): return os.path.join(dir_path, checkpoint_dirs[-1]) +def _get_hf_converted_path(dir_path): + checkpoint_dirs = [ + d + for d in os.listdir(dir_path) + if os.path.isdir(os.path.join(dir_path, d)) and re.match(r"^checkpoint-\d+$", d) + ] + checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1])) + + final_checkpoint_path = os.path.join(dir_path, checkpoint_dirs[-1]) + + hf_converted_dir = [ + d + for d in os.listdir(final_checkpoint_path) + if os.path.isdir(os.path.join(final_checkpoint_path, d)) + and re.match(r"^safetensors-\d+$", d) + ] + hf_converted_dir.sort(key=lambda name: int(name.split("-")[-1])) + + return os.path.join(final_checkpoint_path, hf_converted_dir[-1]) + + def _get_adapter_config(dir_path): with open(os.path.join(dir_path, "adapter_config.json"), encoding="utf-8") as f: return json.load(f) @@ -1974,7 +2057,10 @@ def test_tokenizer_has_no_eos_token(): # TypeError: can only concatenate str (not "NoneType") to str error # when we go to apply the data formatter. 
sft_trainer.train( - model_args, DATA_ARGS, train_args, copy.deepcopy(PEFT_LORA_ARGS) + model_args, + copy.deepcopy(DATA_ARGS), + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) _validate_training(tempdir) @@ -1988,7 +2074,10 @@ def test_invalid_dataset_text_field(): with pytest.raises(KeyError): sft_trainer.train( - MODEL_ARGS, data_args, TRAIN_ARGS, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + copy.deepcopy(TRAIN_ARGS), + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2002,7 +2091,10 @@ def test_invalid_dataset_text_field_and_formatter_template(): with pytest.raises(ValueError): sft_trainer.train( - MODEL_ARGS, data_args, TRAIN_ARGS, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + copy.deepcopy(TRAIN_ARGS), + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2016,7 +2108,10 @@ def test_invalid_formatter_template(): with pytest.raises(KeyError): sft_trainer.train( - MODEL_ARGS, data_args, TRAIN_ARGS, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + copy.deepcopy(TRAIN_ARGS), + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2028,7 +2123,10 @@ def test_malformatted_data(): with pytest.raises((DatasetGenerationError, ValueError)): sft_trainer.train( - MODEL_ARGS, data_args, TRAIN_ARGS, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + copy.deepcopy(TRAIN_ARGS), + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2039,7 +2137,10 @@ def test_empty_data(): with pytest.raises((DatasetGenerationError, ValueError)): sft_trainer.train( - MODEL_ARGS, data_args, TRAIN_ARGS, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + copy.deepcopy(TRAIN_ARGS), + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2054,7 +2155,12 @@ def test_run_causallm_lora_with_invalid_modules(): lora_config.target_modules = ["foo", "bar"] # Peft should throw a value error about modules not matching the base module with pytest.raises(ValueError): - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, lora_config) + sft_trainer.train( + 
copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + lora_config, + ) ### Direct validation tests based on whether or not packing is enabled @@ -2070,7 +2176,10 @@ def test_no_packing_needs_dataset_text_field_or_data_formatter_template(): with pytest.raises(ValueError): sft_trainer.train( - MODEL_ARGS, data_args, train_args, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2086,13 +2195,16 @@ def test_no_packing_needs_reponse_template(): with pytest.raises(ValueError): sft_trainer.train( - MODEL_ARGS, data_args, train_args, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) ### Tests for model dtype edge cases @pytest.mark.skipif( - not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()), + not (torch.cuda.is_available() and not torch.cuda.is_bf16_supported()), reason="Only runs if bf16 is unsupported", ) def test_bf16_still_tunes_if_unsupported(): @@ -2105,7 +2217,10 @@ def test_bf16_still_tunes_if_unsupported(): model_args.torch_dtype = "bfloat16" sft_trainer.train( - model_args, DATA_ARGS, train_args, copy.deepcopy(PEFT_LORA_ARGS) + model_args, + copy.deepcopy(DATA_ARGS), + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) _validate_training(tempdir) @@ -2120,7 +2235,10 @@ def test_bad_torch_dtype(): with pytest.raises(ValueError): sft_trainer.train( - model_args, DATA_ARGS, train_args, copy.deepcopy(PEFT_LORA_ARGS) + model_args, + copy.deepcopy(DATA_ARGS), + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2132,8 +2250,8 @@ def test_run_with_additional_callbacks(): train_args.output_dir = tempdir sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, copy.deepcopy(PEFT_LORA_ARGS), additional_callbacks=[TrainerCallback()], @@ -2151,8 +2269,8 @@ def test_run_with_bad_additional_callbacks(): ValueError, match="additional callbacks 
should be of type TrainerCallback" ): sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, copy.deepcopy(PEFT_LORA_ARGS), additional_callbacks=["NotSupposedToBeHere"], @@ -2172,8 +2290,8 @@ def test_run_with_bad_experimental_metadata(): ValueError, match="exp metadata passed should be a dict with valid json" ): sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, copy.deepcopy(PEFT_LORA_ARGS), additional_callbacks=[TrainerCallback()], @@ -2191,8 +2309,8 @@ def test_run_with_good_experimental_metadata(): metadata = {"dead": "beef"} sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, copy.deepcopy(PEFT_LORA_ARGS), additional_callbacks=[TrainerCallback()], @@ -2218,7 +2336,10 @@ def test_pretokenized_dataset(dataset_path): data_args.response_template = None data_args.training_data_path = dataset_path sft_trainer.train( - MODEL_ARGS, data_args, train_args, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) _validate_training(tempdir) @@ -2244,7 +2365,10 @@ def test_pretokenized_dataset_bad_args(dataset_text_field, response_template): # field or a response template if we have pretokenized data with pytest.raises(ValueError): sft_trainer.train( - MODEL_ARGS, data_args, train_args, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2264,7 +2388,10 @@ def test_pretokenized_dataset_wrong_format(): # is essentially swallowing a KeyError here. 
with pytest.raises(ValueError): sft_trainer.train( - MODEL_ARGS, data_args, train_args, copy.deepcopy(PEFT_LORA_ARGS) + copy.deepcopy(MODEL_ARGS), + data_args, + train_args, + copy.deepcopy(PEFT_LORA_ARGS), ) @@ -2295,8 +2422,8 @@ def test_run_with_bad_additional_data_handlers(additional_handlers): match="Handler should be of type tuning.data_handler.DataHandler, and name of str", ): sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, copy.deepcopy(PEFT_LORA_ARGS), additional_data_handlers=additional_handlers, @@ -2310,8 +2437,8 @@ def test_run_with_additional_data_handlers_as_none(): train_args.output_dir = tempdir sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, copy.deepcopy(PEFT_LORA_ARGS), additional_data_handlers=None, @@ -2357,8 +2484,8 @@ def test_handler(element, **kwargs): data_args.dataset_text_field = "custom_formatted_field" sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, copy.deepcopy(PEFT_LORA_ARGS), additional_data_handlers={ @@ -2464,7 +2591,7 @@ def test_online_data_mixing_plugin_sample_training( train_args.eval_strategy = "steps" train_args.eval_steps = 1 - sft_trainer.train(MODEL_ARGS, data_formatting_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_formatting_args, train_args) # validate full ft configs _validate_training(tempdir) @@ -2548,7 +2675,7 @@ def test_online_data_mixing_plugin_sample_training_no_validation_split( train_args.eval_strategy = "steps" train_args.eval_steps = 1 - sft_trainer.train(MODEL_ARGS, data_formatting_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_formatting_args, train_args) # validate full ft configs _validate_training(tempdir) @@ -2629,7 +2756,7 @@ def test_online_data_mixing_plugin_with_auto_categorization( train_args.eval_strategy = "steps" train_args.eval_steps = 1 - 
sft_trainer.train(MODEL_ARGS, data_formatting_args, train_args) + sft_trainer.train(copy.deepcopy(MODEL_ARGS), data_formatting_args, train_args) # validate full ft configs _validate_training(tempdir) diff --git a/tests/trackers/test_aim_tracker.py b/tests/trackers/test_aim_tracker.py index a641c8be1..4373a8242 100644 --- a/tests/trackers/test_aim_tracker.py +++ b/tests/trackers/test_aim_tracker.py @@ -71,7 +71,9 @@ def test_run_with_aim_tracker_name_but_no_args(): ValueError, match="Aim tracker requested but repo or server is not specified.", ): - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args) + sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args + ) @pytest.mark.skipif(aim_not_available, reason="Requires aimstack to be installed") @@ -90,7 +92,10 @@ def test_e2e_run_with_aim_tracker(aimrepo): tracker_configs = TrackerConfigs(experiment="unit_test", aim_repo=aimrepo) sft_trainer.train( - MODEL_ARGS, DATA_ARGS, train_args, tracker_configs=tracker_configs + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tracker_configs=tracker_configs, ) # validate ft tuning configs @@ -116,7 +121,10 @@ def test_e2e_run_with_aim_runid_export_default_path(aimrepo): tracker_configs = TrackerConfigs(experiment="unit_test", aim_repo=aimrepo) sft_trainer.train( - MODEL_ARGS, DATA_ARGS, train_args, tracker_configs=tracker_configs + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tracker_configs=tracker_configs, ) # validate ft tuning configs diff --git a/tests/trackers/test_clearml_tracker.py b/tests/trackers/test_clearml_tracker.py index 1ae6a19b3..74e7f4d3f 100644 --- a/tests/trackers/test_clearml_tracker.py +++ b/tests/trackers/test_clearml_tracker.py @@ -84,7 +84,10 @@ def test_e2e_run_with_clearml_tracker(): ) sft_trainer.train( - MODEL_ARGS, DATA_ARGS, train_args, tracker_configs=tracker_configs + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tracker_configs=tracker_configs, 
) # validate ft tuning configs @@ -115,7 +118,10 @@ def test_e2e_run_with_clearml_runuri_export_default_path(): tracker_configs = TrackerConfigs(clearml_task="unit_test") sft_trainer.train( - MODEL_ARGS, DATA_ARGS, train_args, tracker_configs=tracker_configs + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tracker_configs=tracker_configs, ) # validate ft tuning configs diff --git a/tests/trackers/test_file_logging_tracker.py b/tests/trackers/test_file_logging_tracker.py index 669b2f51d..6d56dc926 100644 --- a/tests/trackers/test_file_logging_tracker.py +++ b/tests/trackers/test_file_logging_tracker.py @@ -62,7 +62,10 @@ def test_sample_run_with_file_logger_updated_filename(): tracker_configs = TrackerConfigs(training_logs_filename=logs_file) sft_trainer.train( - MODEL_ARGS, DATA_ARGS, train_args, tracker_configs=tracker_configs + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tracker_configs=tracker_configs, ) # validate ft tuning configs diff --git a/tests/trackers/test_hf_resource_scanner_tracker.py b/tests/trackers/test_hf_resource_scanner_tracker.py index 237ca8cbf..b4ca8b1ee 100644 --- a/tests/trackers/test_hf_resource_scanner_tracker.py +++ b/tests/trackers/test_hf_resource_scanner_tracker.py @@ -79,7 +79,10 @@ def test_sample_run_with_hf_resource_scanner_updated_filename(): ) sft_trainer.train( - MODEL_ARGS, DATA_ARGS, train_args, tracker_configs=tracker_configs + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tracker_configs=tracker_configs, ) # validate ft tuning configs diff --git a/tests/trackers/test_mlflow_tracker.py b/tests/trackers/test_mlflow_tracker.py index f7e24a069..7b84d160c 100644 --- a/tests/trackers/test_mlflow_tracker.py +++ b/tests/trackers/test_mlflow_tracker.py @@ -56,7 +56,9 @@ def test_run_with_mlflow_tracker_name_but_no_args(): ValueError, match="mlflow tracker requested but mlflow_uri is not specified.", ): - sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args) + 
sft_trainer.train( + copy.deepcopy(MODEL_ARGS), copy.deepcopy(DATA_ARGS), train_args + ) @pytest.mark.skipif(mlflow_not_available, reason="Requires mlflow to be installed") @@ -86,7 +88,10 @@ def test_e2e_run_with_mlflow_tracker(): ) sft_trainer.train( - MODEL_ARGS, DATA_ARGS, train_args, tracker_configs=tracker_configs + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tracker_configs=tracker_configs, ) # validate ft tuning configs @@ -116,7 +121,10 @@ def test_e2e_run_with_mlflow_runuri_export_default_path(): ) sft_trainer.train( - MODEL_ARGS, DATA_ARGS, train_args, tracker_configs=tracker_configs + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), + train_args, + tracker_configs=tracker_configs, ) # validate ft tuning configs diff --git a/tests/trackers/test_tracker_api.py b/tests/trackers/test_tracker_api.py index 7e3a13152..68ad0d622 100644 --- a/tests/trackers/test_tracker_api.py +++ b/tests/trackers/test_tracker_api.py @@ -41,8 +41,8 @@ def test_run_with_bad_tracker_config(): match="tracker configs should adhere to the TrackerConfigs type", ): sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, tracker_configs="NotSupposedToBeHere", ) @@ -63,7 +63,7 @@ def test_run_with_bad_tracker_name(): ValueError, match=r"Requested Tracker {} not found.".format(bad_name) ): sft_trainer.train( - MODEL_ARGS, - DATA_ARGS, + copy.deepcopy(MODEL_ARGS), + copy.deepcopy(DATA_ARGS), train_args, ) diff --git a/tests/utils/test_config_utils.py b/tests/utils/test_config_utils.py index de9d08bd6..fd9325199 100644 --- a/tests/utils/test_config_utils.py +++ b/tests/utils/test_config_utils.py @@ -17,7 +17,6 @@ # Standard import base64 -import os import pickle # Third Party @@ -208,20 +207,19 @@ def test_update_config_can_handle_multiple_config_updates(): assert config[1].r == 98 -def test_get_json_config_can_load_from_path(): +def test_get_json_config_can_load_from_path(monkeypatch): """Test that 
the function get_json_config can read the json path from env var SFT_TRAINER_CONFIG_JSON_PATH """ - if "SFT_TRAINER_CONFIG_JSON_ENV_VAR" in os.environ: - del os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] - os.environ["SFT_TRAINER_CONFIG_JSON_PATH"] = HAPPY_PATH_DUMMY_CONFIG_PATH + monkeypatch.delenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", raising=False) + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_PATH", HAPPY_PATH_DUMMY_CONFIG_PATH) job_config = config_utils.get_json_config() assert job_config is not None assert job_config["model_name_or_path"] == "bigscience/bloom-560m" -def test_get_json_config_can_load_from_envvar(): +def test_get_json_config_can_load_from_envvar(monkeypatch): """Test that the function get_json_config can read the json path from env var SFT_TRAINER_CONFIG_JSON_ENV_VAR """ @@ -229,7 +227,8 @@ def test_get_json_config_can_load_from_envvar(): message_bytes = pickle.dumps(config_json) base64_bytes = base64.b64encode(message_bytes) encoded_json = base64_bytes.decode("ascii") - os.environ["SFT_TRAINER_CONFIG_JSON_ENV_VAR"] = encoded_json + monkeypatch.delenv("SFT_TRAINER_CONFIG_JSON_PATH", raising=False) + monkeypatch.setenv("SFT_TRAINER_CONFIG_JSON_ENV_VAR", encoded_json) job_config = config_utils.get_json_config() assert job_config is not None diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py index a3a55e501..5ec7e7ab7 100644 --- a/tests/utils/test_embedding_resize.py +++ b/tests/utils/test_embedding_resize.py @@ -47,6 +47,8 @@ def _inference( ) -> str: device = "cuda" if torch.cuda.is_available() else "cpu" tokenized_input = tokenizer(input_text, return_tensors="pt").to(device) + model = model.to(device) + generated_output = model.generate( **tokenized_input, max_new_tokens=max_new_tokens, diff --git a/tox.ini b/tox.ini index 0f33a5261..977ce1108 100644 --- a/tox.ini +++ b/tox.ini @@ -53,3 +53,33 @@ commands = coverage report -m coverage xml genbadge coverage -s -i coverage.xml + +[testenv:accel] +description = 
run all unit tests including requiring GPU support +deps = + pytest>=7 + .[aim,mlflow,clearml,scanner-dev,fms-accel-all] +setenv = + CUDA_VISIBLE_DEVICES=0 +commands_pre = + pip install --no-build-isolation .[flash-attn] +commands = + pytest tests/acceleration + pytest tests/build + pytest tests/data + pytest tests/trackers + pytest tests/trainercontroller + pytest tests/utils + pytest tests/test_sft_trainer.py + +[testenv:gpu] +description = run all unit tests including requiring GPU support +deps = + pytest>=7 + .[aim,mlflow,clearml,scanner-dev,fms-accel-all] +setenv = + CUDA_VISIBLE_DEVICES=0 +commands_pre = + pip install --no-build-isolation .[flash-attn] +commands = + pytest {posargs:tests} diff --git a/tuning/config/acceleration_configs/acceleration_framework_config.py b/tuning/config/acceleration_configs/acceleration_framework_config.py index 09309cbfa..78bfa63fe 100644 --- a/tuning/config/acceleration_configs/acceleration_framework_config.py +++ b/tuning/config/acceleration_configs/acceleration_framework_config.py @@ -240,6 +240,9 @@ def get_framework(self): try: with NamedTemporaryFile("w") as f: self.to_yaml(f.name) + + AccelerationFramework.active_plugins = [] + AccelerationFramework.plugins_require_custom_loading = [] return AccelerationFramework(f.name) except ValueError as e: (msg,) = e.args