From c1d857cf1c61b625414cfa5d2ca0ef017f16cc1a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Feb 2026 03:52:46 +0000 Subject: [PATCH 1/4] Initial plan From 2cd10a6c0384fc691d66b77fc86d07ce26c4e26f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Feb 2026 03:59:53 +0000 Subject: [PATCH 2/4] Add support for torch.distributed as alias for pytorch distribution type - Added TORCH_DISTRIBUTED constant as legacy alias in DistributionType - Updated DISTRIBUTION_TYPE_MAP to map both pytorch and torch.distributed to PyTorchDistribution - Updated PyTorchDistributionSchema to accept both values in allowed_values - Added test YAML file with torch.distributed type - Added unit test to verify both values work interchangeably Co-authored-by: mohammadsheraj <221899694+mohammadsheraj@users.noreply.github.com> --- .../azure/ai/ml/_schema/job/distribution.py | 5 ++- .../azure/ai/ml/constants/_job/job.py | 2 ++ .../azure/ai/ml/entities/_job/distribution.py | 2 ++ .../unittests/test_command_job_schema.py | 31 +++++++++++++++++++ .../dist_job_pytorch_torch_distributed.yml | 14 +++++++++ 5 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 sdk/ml/azure-ai-ml/tests/test_configs/command_job/dist_job_pytorch_torch_distributed.yml diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/_schema/job/distribution.py b/sdk/ml/azure-ai-ml/azure/ai/ml/_schema/job/distribution.py index 475792a36b97..5ce5deff3998 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/_schema/job/distribution.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/_schema/job/distribution.py @@ -59,7 +59,10 @@ def predump(self, data, **kwargs): class PyTorchDistributionSchema(metaclass=PatchedSchemaMeta): - type = StringTransformedEnum(required=True, allowed_values=DistributionType.PYTORCH) + type = StringTransformedEnum( + required=True, + allowed_values=[DistributionType.PYTORCH, DistributionType.TORCH_DISTRIBUTED] + ) process_count_per_instance = fields.Int() @post_load diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/constants/_job/job.py b/sdk/ml/azure-ai-ml/azure/ai/ml/constants/_job/job.py index 4037dbb305a9..0d64e438b71a 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/constants/_job/job.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/constants/_job/job.py @@ -8,6 +8,8 @@ class DistributionType: TENSORFLOW = "tensorflow" PYTORCH = "pytorch" RAY = "ray" + # Legacy alias for backwards compatibility with AML SDK v1.5 + TORCH_DISTRIBUTED = "torch.distributed" class JobType(object): diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py index ec7277c647eb..5ec9f245350c 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py @@ -226,4 +226,6 @@ def _to_rest_object(self) -> RestRay: DistributionType.TENSORFLOW: TensorFlowDistribution, DistributionType.PYTORCH: PyTorchDistribution, DistributionType.RAY: RayDistribution, + # Support legacy alias for backwards compatibility with AML SDK v1.5 + DistributionType.TORCH_DISTRIBUTED: PyTorchDistribution, } diff --git a/sdk/ml/azure-ai-ml/tests/command_job/unittests/test_command_job_schema.py b/sdk/ml/azure-ai-ml/tests/command_job/unittests/test_command_job_schema.py index efb2b46309ed..7a92bfd6d504 100644 --- a/sdk/ml/azure-ai-ml/tests/command_job/unittests/test_command_job_schema.py +++ b/sdk/ml/azure-ai-ml/tests/command_job/unittests/test_command_job_schema.py @@ -73,6 +73,37 @@ def test_invalid_distribution_config(self): with pytest.raises(ValidationError): CommandJob(**schema.load(cfg)) + def test_pytorch_torch_distributed_interchangeable(self): + """Test that type: pytorch and type: torch.distributed work interchangeably""" + # Test with torch.distributed type + path_torch_distributed = "./tests/test_configs/command_job/dist_job_pytorch_torch_distributed.yml" + # Test with pytorch type (existing file) + path_pytorch = "./tests/test_configs/command_job/dist_job_1.yml" + + context = {BASE_PATH_CONTEXT_KEY: Path(path_torch_distributed).parent} + schema = CommandJobSchema(context=context) + + # Load and verify torch.distributed type + with open(path_torch_distributed, "r") as f: + cfg_torch_distributed = yaml.safe_load(f) + job_torch_distributed = CommandJob(**schema.load(cfg_torch_distributed)) + + # Load and verify pytorch type + with open(path_pytorch, "r") as f: + cfg_pytorch = yaml.safe_load(f) + job_pytorch = CommandJob(**schema.load(cfg_pytorch)) + + # Both should create PyTorchDistribution objects + from azure.ai.ml.entities._job.distribution import PyTorchDistribution + assert isinstance(job_torch_distributed.distribution, PyTorchDistribution) + assert isinstance(job_pytorch.distribution, PyTorchDistribution) + + # Verify roundtrip for torch.distributed + rest_obj = job_torch_distributed._to_rest_object() + reconstructed = CommandJob._load_from_rest(rest_obj) + assert isinstance(reconstructed.distribution, PyTorchDistribution) + assert reconstructed.distribution.process_count_per_instance == 4 + def test_deserialize_inputs(self): test_path = "./tests/test_configs/command_job/command_job_inputs_test.yml" with open("./tests/test_configs/command_job/command_job_inputs_rest.yml", "r") as f: diff --git a/sdk/ml/azure-ai-ml/tests/test_configs/command_job/dist_job_pytorch_torch_distributed.yml b/sdk/ml/azure-ai-ml/tests/test_configs/command_job/dist_job_pytorch_torch_distributed.yml new file mode 100644 index 000000000000..3803a6821ede --- /dev/null +++ b/sdk/ml/azure-ai-ml/tests/test_configs/command_job/dist_job_pytorch_torch_distributed.yml @@ -0,0 +1,14 @@ +command: pip freeze +environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:33 +name: "test_pytorch_torch_distributed" +compute: "azureml:testCompute" +distribution: + type: "torch.distributed" + process_count_per_instance: 4 +experiment_name: mfe-test1 +properties: + test_property: test_value +resources: + instance_count: 2 +limits: + timeout: 30 From d4bdcb4d6e55dadbba163b48c4d43f1310caa594 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Feb 2026 04:02:22 +0000 Subject: [PATCH 3/4] Update pipeline helpers to support torch.distributed - Modified from_dict_to_rest_distribution to accept both pytorch and torch.distributed - Updated error message to include torch.distributed in valid types - Added comprehensive unit tests for distribution helper functions Co-authored-by: mohammadsheraj <221899694+mohammadsheraj@users.noreply.github.com> --- .../_job/pipeline/_pipeline_job_helpers.py | 4 +- .../unittests/test_distribution_helpers.py | 54 +++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 sdk/ml/azure-ai-ml/tests/pipeline_job/unittests/test_distribution_helpers.py diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_pipeline_job_helpers.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_pipeline_job_helpers.py index 3a7d89e75985..8906d1e36c92 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_pipeline_job_helpers.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_pipeline_job_helpers.py @@ -165,7 +165,7 @@ def from_dict_to_rest_io( def from_dict_to_rest_distribution(distribution_dict: Dict) -> Union[PyTorch, Mpi, TensorFlow, Ray]: target_type = distribution_dict["distribution_type"].lower() - if target_type == "pytorch": + if target_type == "pytorch" or target_type == "torch.distributed": return PyTorch(**distribution_dict) if target_type == "mpi": return Mpi(**distribution_dict) @@ -173,7 +173,7 @@ def from_dict_to_rest_distribution(distribution_dict: Dict) -> Union[PyTorch, Mp return TensorFlow(**distribution_dict) if target_type == "ray": return Ray(**distribution_dict) - msg = "Distribution type must be pytorch, mpi, tensorflow or ray: {}".format(target_type) + msg = "Distribution type must be pytorch, torch.distributed, mpi, tensorflow or ray: {}".format(target_type) raise ValidationException( message=msg, no_personal_data_message=msg, diff --git a/sdk/ml/azure-ai-ml/tests/pipeline_job/unittests/test_distribution_helpers.py b/sdk/ml/azure-ai-ml/tests/pipeline_job/unittests/test_distribution_helpers.py new file mode 100644 index 000000000000..e37ed683c73a --- /dev/null +++ b/sdk/ml/azure-ai-ml/tests/pipeline_job/unittests/test_distribution_helpers.py @@ -0,0 +1,54 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +import pytest +from azure.ai.ml.entities._job.pipeline._pipeline_job_helpers import from_dict_to_rest_distribution + + +@pytest.mark.unittest +@pytest.mark.training_experiences_test +class TestDistributionHelpers: + """Test distribution helper functions to ensure torch.distributed is supported""" + + def test_from_dict_to_rest_distribution_pytorch(self): + """Test that pytorch type is properly handled""" + distribution_dict = { + "distribution_type": "pytorch", + "process_count_per_instance": 4 + } + result = from_dict_to_rest_distribution(distribution_dict) + assert result is not None + assert hasattr(result, 'process_count_per_instance') + assert result.process_count_per_instance == 4 + + def test_from_dict_to_rest_distribution_torch_distributed(self): + """Test that torch.distributed type is properly handled""" + distribution_dict = { + "distribution_type": "torch.distributed", + "process_count_per_instance": 4 + } + result = from_dict_to_rest_distribution(distribution_dict) + assert result is not None + assert hasattr(result, 'process_count_per_instance') + assert result.process_count_per_instance == 4 + + def test_from_dict_to_rest_distribution_pytorch_case_insensitive(self): + """Test that PyTorch (mixed case) type is properly handled""" + distribution_dict = { + "distribution_type": "PyTorch", + "process_count_per_instance": 2 + } + result = from_dict_to_rest_distribution(distribution_dict) + assert result is not None + assert result.process_count_per_instance == 2 + + def test_from_dict_to_rest_distribution_torch_distributed_case_insensitive(self): + """Test that torch.distributed (mixed case) type is properly handled""" + distribution_dict = { + "distribution_type": "Torch.Distributed", + "process_count_per_instance": 2 + } + result = from_dict_to_rest_distribution(distribution_dict) + assert result is not None + assert result.process_count_per_instance == 2 From 5d886262b2c0096eaa681bdf5db6db5087e4dee7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Feb 2026 04:03:43 +0000 Subject: [PATCH 4/4] Add torch.distributed to SDK_TO_REST mapping - Added TORCH_DISTRIBUTED mapping to RestDistributionType.PY_TORCH - Ensures proper REST API conversion for both pytorch and torch.distributed Co-authored-by: mohammadsheraj <221899694+mohammadsheraj@users.noreply.github.com> --- sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py index 5ec9f245350c..1ce90def73fb 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/distribution.py @@ -23,6 +23,8 @@ DistributionType.TENSORFLOW: RestDistributionType.TENSOR_FLOW, DistributionType.PYTORCH: RestDistributionType.PY_TORCH, DistributionType.RAY: RestDistributionType.RAY, + # Support legacy alias - maps to the same REST type as PYTORCH + DistributionType.TORCH_DISTRIBUTED: RestDistributionType.PY_TORCH, }