From c51b3a3854520eece12fe49e22799b68d71cb2c1 Mon Sep 17 00:00:00 2001 From: Gleb Kanterov Date: Fri, 15 Aug 2025 10:38:12 +0200 Subject: [PATCH 1/3] [Python] Generate each resource in separate namespace --- .../codegen/codegen/generated_dataclass.py | 20 +- .../codegen/generated_dataclass_patch.py | 9 +- .../python/codegen/codegen/generated_enum.py | 4 +- .../codegen/codegen/generated_imports.py | 12 +- experimental/python/codegen/codegen/main.py | 39 +-- .../python/codegen/codegen/packages.py | 27 +- .../codegen_tests/test_generated_dataclass.py | 9 +- .../codegen_tests/test_generated_enum.py | 3 +- .../databricks/bundles/jobs/__init__.py | 234 +++++++++--------- .../_models/adlsgen2_info.py | 0 .../{compute => jobs}/_models/auto_scale.py | 0 .../_models/aws_attributes.py | 10 +- .../_models/aws_availability.py | 0 .../_models/azure_attributes.py | 10 +- .../_models/azure_availability.py | 0 .../_models/clients_types.py | 0 .../_models/cluster_log_conf.py | 12 +- .../{compute => jobs}/_models/cluster_spec.py | 37 ++- .../_models/data_security_mode.py | 0 .../_models/dbfs_storage_info.py | 0 .../_models/docker_basic_auth.py | 0 .../{compute => jobs}/_models/docker_image.py | 8 +- .../_models/ebs_volume_type.py | 0 .../{compute => jobs}/_models/environment.py | 0 .../_models/gcp_attributes.py | 8 +- .../_models/gcp_availability.py | 0 .../_models/gcs_storage_info.py | 0 .../_models/init_script_info.py | 18 +- .../bundles/jobs/_models/job_cluster.py | 5 +- .../bundles/jobs/_models/job_environment.py | 2 +- .../{compute => jobs}/_models/library.py | 12 +- .../_models/local_file_info.py | 0 .../_models/log_analytics_info.py | 0 .../_models/maven_library.py | 0 .../_models/python_py_pi_library.py | 0 .../_models/r_cran_library.py | 0 .../_models/runtime_engine.py | 0 .../_models/s3_storage_info.py | 0 .../databricks/bundles/jobs/_models/task.py | 10 +- .../_models/volumes_storage_info.py | 0 .../_models/workload_type.py | 8 +- .../_models/workspace_storage_info.py | 0 .../databricks/bundles/pipelines/__init__.py | 116 ++++----- .../pipelines/_models/adlsgen2_info.py | 40 +++ .../pipelines/_models/aws_attributes.py | 216 ++++++++++++++++ .../pipelines/_models/aws_availability.py | 19 ++ .../pipelines/_models/azure_attributes.py | 90 +++++++ .../pipelines/_models/azure_availability.py | 19 ++ .../pipelines/_models/cluster_log_conf.py | 82 ++++++ .../pipelines/_models/dbfs_storage_info.py | 40 +++ .../pipelines/_models/ebs_volume_type.py | 17 ++ .../pipelines/_models/gcp_attributes.py | 100 ++++++++ .../pipelines/_models/gcp_availability.py | 19 ++ .../pipelines/_models/gcs_storage_info.py | 40 +++ .../pipelines/_models/ingestion_config.py | 5 +- .../pipelines/_models/init_script_info.py | 129 ++++++++++ .../pipelines/_models/local_file_info.py | 38 +++ .../pipelines/_models/log_analytics_info.py | 48 ++++ .../pipelines/_models/maven_library.py | 70 ++++++ .../pipelines/_models/pipeline_cluster.py | 24 +- .../pipelines/_models/pipeline_library.py | 8 +- .../pipelines/_models/s3_storage_info.py | 122 +++++++++ .../pipelines/_models/volumes_storage_info.py | 42 ++++ .../_models/workspace_storage_info.py | 40 +++ 64 files changed, 1492 insertions(+), 329 deletions(-) rename experimental/python/databricks/bundles/{compute => jobs}/_models/adlsgen2_info.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/auto_scale.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/aws_attributes.py (98%) rename 
experimental/python/databricks/bundles/{compute => jobs}/_models/aws_availability.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/azure_attributes.py (96%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/azure_availability.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/clients_types.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/cluster_log_conf.py (92%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/cluster_spec.py (96%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/data_security_mode.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/dbfs_storage_info.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/docker_basic_auth.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/docker_image.py (94%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/ebs_volume_type.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/environment.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/gcp_attributes.py (98%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/gcp_availability.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/gcs_storage_info.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/init_script_info.py (90%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/library.py (95%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/local_file_info.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/log_analytics_info.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/maven_library.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/python_py_pi_library.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/r_cran_library.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/runtime_engine.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/s3_storage_info.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/volumes_storage_info.py (100%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/workload_type.py (94%) rename experimental/python/databricks/bundles/{compute => jobs}/_models/workspace_storage_info.py (100%) create mode 100644 experimental/python/databricks/bundles/pipelines/_models/adlsgen2_info.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/aws_attributes.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/aws_availability.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/azure_attributes.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/azure_availability.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/cluster_log_conf.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/dbfs_storage_info.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/ebs_volume_type.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/gcp_attributes.py create mode 100644 
experimental/python/databricks/bundles/pipelines/_models/gcp_availability.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/gcs_storage_info.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/init_script_info.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/local_file_info.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/log_analytics_info.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/maven_library.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/s3_storage_info.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/volumes_storage_info.py create mode 100644 experimental/python/databricks/bundles/pipelines/_models/workspace_storage_info.py diff --git a/experimental/python/codegen/codegen/generated_dataclass.py b/experimental/python/codegen/codegen/generated_dataclass.py index 5a1d5a9d6d..e1888aadab 100644 --- a/experimental/python/codegen/codegen/generated_dataclass.py +++ b/experimental/python/codegen/codegen/generated_dataclass.py @@ -134,12 +134,13 @@ class GeneratedDataclass: def generate_field( + namespace: str, field_name: str, prop: Property, is_required: bool, ) -> GeneratedField: - field_type = generate_type(prop.ref, is_param=False) - param_type = generate_type(prop.ref, is_param=True) + field_type = generate_type(namespace, prop.ref, is_param=False) + param_type = generate_type(namespace, prop.ref, is_param=True) field_type = variable_or_type(field_type, is_required=is_required) param_type = variable_or_type(param_type, is_required=is_required) @@ -255,10 +256,11 @@ def variable_or_dict_type(element_type: GeneratedType) -> GeneratedType: ) -def generate_type(ref: str, is_param: bool) -> GeneratedType: +def generate_type(namespace: str, ref: str, is_param: bool) -> GeneratedType: if ref.startswith("#/$defs/slice/"): element_ref = ref.replace("#/$defs/slice/", "#/$defs/") element_type = generate_type( + namespace=namespace, ref=element_ref, is_param=is_param, ) @@ -273,7 +275,7 @@ def generate_type(ref: str, is_param: bool) -> GeneratedType: return dict_type() class_name = packages.get_class_name(ref) - package = packages.get_package(ref) + package = packages.get_package(namespace, ref) if is_param and package: class_name += "Param" @@ -293,7 +295,11 @@ def resource_type() -> GeneratedType: ) -def generate_dataclass(schema_name: str, schema: Schema) -> GeneratedDataclass: +def generate_dataclass( + namespace: str, + schema_name: str, + schema: Schema, +) -> GeneratedDataclass: print(f"Generating dataclass for {schema_name}") fields = list[GeneratedField]() @@ -301,12 +307,12 @@ def generate_dataclass(schema_name: str, schema: Schema) -> GeneratedDataclass: for name, prop in schema.properties.items(): is_required = name in schema.required - field = generate_field(name, prop, is_required=is_required) + field = generate_field(namespace, name, prop, is_required=is_required) fields.append(field) extends = [] - package = packages.get_package(schema_name) + package = packages.get_package(namespace, schema_name) assert package diff --git a/experimental/python/codegen/codegen/generated_dataclass_patch.py b/experimental/python/codegen/codegen/generated_dataclass_patch.py index 50bcd3205a..9b2ae7ecba 100644 --- a/experimental/python/codegen/codegen/generated_dataclass_patch.py +++ b/experimental/python/codegen/codegen/generated_dataclass_patch.py @@ -37,10 +37,11 @@ class Bar: # see also 
_append_resolve_recursive_imports - models["jobs.ForEachTask"] = _quote_recursive_references_for_model( - models["jobs.ForEachTask"], - references={"Task", "TaskParam"}, - ) + if "jobs.ForEachTask" in models: + models["jobs.ForEachTask"] = _quote_recursive_references_for_model( + models["jobs.ForEachTask"], + references={"Task", "TaskParam"}, + ) def _quote_recursive_references_for_model( diff --git a/experimental/python/codegen/codegen/generated_enum.py b/experimental/python/codegen/codegen/generated_enum.py index 6b482a4072..7f413d5b38 100644 --- a/experimental/python/codegen/codegen/generated_enum.py +++ b/experimental/python/codegen/codegen/generated_enum.py @@ -17,11 +17,11 @@ class GeneratedEnum: experimental: bool -def generate_enum(schema_name: str, schema: Schema) -> GeneratedEnum: +def generate_enum(namespace: str, schema_name: str, schema: Schema) -> GeneratedEnum: assert schema.enum class_name = packages.get_class_name(schema_name) - package = packages.get_package(schema_name) + package = packages.get_package(namespace, schema_name) values = {} assert package diff --git a/experimental/python/codegen/codegen/generated_imports.py b/experimental/python/codegen/codegen/generated_imports.py index f9217b644f..527764ed39 100644 --- a/experimental/python/codegen/codegen/generated_imports.py +++ b/experimental/python/codegen/codegen/generated_imports.py @@ -11,9 +11,9 @@ def append_enum_imports( enums: dict[str, GeneratedEnum], exclude_packages: list[str], ) -> None: - for schema_name in enums.keys(): - package = packages.get_package(schema_name) - class_name = packages.get_class_name(schema_name) + for generated in enums.values(): + package = generated.package + class_name = generated.class_name if package in exclude_packages: continue @@ -26,9 +26,9 @@ def append_dataclass_imports( dataclasses: dict[str, GeneratedDataclass], exclude_packages: list[str], ) -> None: - for schema_name in dataclasses.keys(): - package = packages.get_package(schema_name) - class_name = packages.get_class_name(schema_name) + for generated in dataclasses.values(): + package = generated.package + class_name = generated.class_name if package in exclude_packages: continue diff --git a/experimental/python/codegen/codegen/main.py b/experimental/python/codegen/codegen/main.py index 3ef2b5d2ee..18a11cc627 100644 --- a/experimental/python/codegen/codegen/main.py +++ b/experimental/python/codegen/codegen/main.py @@ -28,20 +28,21 @@ def main(output: str): schemas = _remove_deprecated_fields(schemas) schemas = _remove_unused_schemas(packages.RESOURCE_TYPES, schemas) - dataclasses, enums = _generate_code(schemas) - - generated_dataclass_patch.reorder_required_fields(dataclasses) - generated_dataclass_patch.quote_recursive_references(dataclasses) + # each resource has its own namespace and is generated separately so + # that there are no dependencies between namespaces, unlike in Databricks SDK v1 + for resource, namespace in packages.RESOURCE_NAMESPACE.items(): + # only generate code for schemas used directly or transitively by the resource + reachable = _collect_reachable_schemas([resource], schemas) + reachable_schemas = {k: v for k, v in schemas.items() if k in reachable} - _write_code(dataclasses, enums, output) + dataclasses, enums = _generate_code(namespace, reachable_schemas) - for resource in packages.RESOURCE_TYPES: - reachable = _collect_reachable_schemas([resource], schemas) + generated_dataclass_patch.reorder_required_fields(dataclasses) + generated_dataclass_patch.quote_recursive_references(dataclasses) - 
resource_dataclasses = {k: v for k, v in dataclasses.items() if k in reachable} - resource_enums = {k: v for k, v in enums.items() if k in reachable} + _write_code(dataclasses, enums, output) - _write_exports(resource, resource_dataclasses, resource_enums, output) + _write_exports(namespace, dataclasses, enums, output) def _transitively_mark_deprecated_and_private( @@ -95,6 +96,7 @@ def _remove_deprecated_fields( def _generate_code( + namespace: str, schemas: dict[str, openapi.Schema], ) -> tuple[dict[str, GeneratedDataclass], dict[str, GeneratedEnum]]: dataclasses = {} @@ -102,11 +104,13 @@ def _generate_code( for schema_name, schema in schemas.items(): if schema.type == openapi.SchemaType.OBJECT: - generated = generated_dataclass.generate_dataclass(schema_name, schema) + generated = generated_dataclass.generate_dataclass( + namespace, schema_name, schema + ) dataclasses[schema_name] = generated elif schema.type == openapi.SchemaType.STRING: - generated = generated_enum.generate_enum(schema_name, schema) + generated = generated_enum.generate_enum(namespace, schema_name, schema) enums[schema_name] = generated else: @@ -116,7 +120,7 @@ def _generate_code( def _write_exports( - root: str, + namespace: str, dataclasses: dict[str, GeneratedDataclass], enums: dict[str, GeneratedEnum], output: str, @@ -148,14 +152,11 @@ def _write_exports( generated_imports.append_enum_imports(b, enums, exclude_packages=[]) # FIXME should be better generalized - if root == "resources.Job": + if namespace == "jobs": _append_resolve_recursive_imports(b) - root_package = packages.get_package(root) - assert root_package - - # transform databricks.bundles.jobs._models.job -> databricks/bundles/jobs - package_path = Path(root_package.replace(".", "/")).parent.parent + root_package = packages.get_root_package(namespace) + package_path = Path(root_package.replace(".", "/")) source_path = Path(output) / package_path / "__init__.py" source_path.parent.mkdir(exist_ok=True, parents=True) diff --git a/experimental/python/codegen/codegen/packages.py b/experimental/python/codegen/codegen/packages.py index b81e9cad4d..d0a14396f2 100644 --- a/experimental/python/codegen/codegen/packages.py +++ b/experimental/python/codegen/codegen/packages.py @@ -1,20 +1,13 @@ import re from typing import Optional -RESOURCE_NAMESPACE_OVERRIDE = { +# All supported resource types and their namespace +RESOURCE_NAMESPACE = { "resources.Job": "jobs", "resources.Pipeline": "pipelines", - "resources.JobPermission": "jobs", - "resources.JobPermissionLevel": "jobs", - "resources.PipelinePermission": "pipelines", - "resources.PipelinePermissionLevel": "pipelines", } -# All supported resource types -RESOURCE_TYPES = [ - "resources.Job", - "resources.Pipeline", -] +RESOURCE_TYPES = list(RESOURCE_NAMESPACE.keys()) # Namespaces to load from OpenAPI spec. # @@ -72,7 +65,11 @@ def should_load_ref(ref: str) -> bool: return name in PRIMITIVES -def get_package(ref: str) -> Optional[str]: +def get_root_package(namespace: str) -> str: + return f"databricks.bundles.{namespace}" + + +def get_package(namespace: str, ref: str) -> Optional[str]: """ Returns Python package for a given OpenAPI ref. Returns None for builtin types. @@ -83,11 +80,7 @@ def get_package(ref: str) -> Optional[str]: if full_name in PRIMITIVES: return None - [namespace, name] = full_name.split(".") - - if override := RESOURCE_NAMESPACE_OVERRIDE.get(full_name): - namespace = override - + [_, name] = full_name.split(".") package_name = re.sub(r"(?@.dfs.core.windows.net/`. 
+ """ + + @classmethod + def from_dict(cls, value: "Adlsgen2InfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "Adlsgen2InfoDict": + return _transform_to_json_value(self) # type:ignore + + +class Adlsgen2InfoDict(TypedDict, total=False): + """""" + + destination: VariableOr[str] + """ + abfss destination, e.g. `abfss://@.dfs.core.windows.net/`. + """ + + +Adlsgen2InfoParam = Adlsgen2InfoDict | Adlsgen2Info diff --git a/experimental/python/databricks/bundles/pipelines/_models/aws_attributes.py b/experimental/python/databricks/bundles/pipelines/_models/aws_attributes.py new file mode 100644 index 0000000000..542f55aa2f --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/aws_attributes.py @@ -0,0 +1,216 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOrOptional +from databricks.bundles.pipelines._models.aws_availability import ( + AwsAvailability, + AwsAvailabilityParam, +) +from databricks.bundles.pipelines._models.ebs_volume_type import ( + EbsVolumeType, + EbsVolumeTypeParam, +) + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class AwsAttributes: + """ + Attributes set during cluster creation which are related to Amazon Web Services. + """ + + availability: VariableOrOptional[AwsAvailability] = None + + ebs_volume_count: VariableOrOptional[int] = None + """ + The number of volumes launched for each instance. Users can choose up to 10 volumes. + This feature is only enabled for supported node types. Legacy node types cannot specify + custom EBS volumes. + For node types with no instance store, at least one EBS volume needs to be specified; + otherwise, cluster creation will fail. + + These EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc. + Instance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc. + + If EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for + scratch storage because heterogenously sized scratch devices can lead to inefficient disk + utilization. If no EBS volumes are attached, Databricks will configure Spark to use instance + store volumes. + + Please note that if EBS volumes are specified, then the Spark configuration `spark.local.dir` + will be overridden. + """ + + ebs_volume_iops: VariableOrOptional[int] = None + """ + If using gp3 volumes, what IOPS to use for the disk. If this is not set, the maximum performance of a gp2 volume with the same volume size will be used. + """ + + ebs_volume_size: VariableOrOptional[int] = None + """ + The size of each EBS volume (in GiB) launched for each instance. For general purpose + SSD, this value must be within the range 100 - 4096. For throughput optimized HDD, + this value must be within the range 500 - 4096. + """ + + ebs_volume_throughput: VariableOrOptional[int] = None + """ + If using gp3 volumes, what throughput to use for the disk. If this is not set, the maximum performance of a gp2 volume with the same volume size will be used. + """ + + ebs_volume_type: VariableOrOptional[EbsVolumeType] = None + + first_on_demand: VariableOrOptional[int] = None + """ + The first `first_on_demand` nodes of the cluster will be placed on on-demand instances. 
+ If this value is greater than 0, the cluster driver node in particular will be placed on an + on-demand instance. If this value is greater than or equal to the current cluster size, all + nodes will be placed on on-demand instances. If this value is less than the current cluster + size, `first_on_demand` nodes will be placed on on-demand instances and the remainder will + be placed on `availability` instances. Note that this value does not affect + cluster size and cannot currently be mutated over the lifetime of a cluster. + """ + + instance_profile_arn: VariableOrOptional[str] = None + """ + Nodes for this cluster will only be placed on AWS instances with this instance profile. If + ommitted, nodes will be placed on instances without an IAM instance profile. The instance + profile must have previously been added to the Databricks environment by an account + administrator. + + This feature may only be available to certain customer plans. + """ + + spot_bid_price_percent: VariableOrOptional[int] = None + """ + The bid price for AWS spot instances, as a percentage of the corresponding instance type's + on-demand price. + For example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot + instance, then the bid price is half of the price of + on-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice + the price of on-demand `r3.xlarge` instances. If not specified, the default value is 100. + When spot instances are requested for this cluster, only spot instances whose bid price + percentage matches this field will be considered. + Note that, for safety, we enforce this field to be no more than 10000. + """ + + zone_id: VariableOrOptional[str] = None + """ + Identifier for the availability zone/datacenter in which the cluster resides. + This string will be of a form like "us-west-2a". The provided availability + zone must be in the same region as the Databricks deployment. For example, "us-west-2a" + is not a valid zone id if the Databricks deployment resides in the "us-east-1" region. + This is an optional field at cluster creation, and if not specified, a default zone will be used. + If the zone specified is "auto", will try to place cluster in a zone with high availability, + and will retry placement in a different AZ if there is not enough capacity. + + The list of available zones as well as the default value can be found by using the + `List Zones` method. + """ + + @classmethod + def from_dict(cls, value: "AwsAttributesDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "AwsAttributesDict": + return _transform_to_json_value(self) # type:ignore + + +class AwsAttributesDict(TypedDict, total=False): + """""" + + availability: VariableOrOptional[AwsAvailabilityParam] + + ebs_volume_count: VariableOrOptional[int] + """ + The number of volumes launched for each instance. Users can choose up to 10 volumes. + This feature is only enabled for supported node types. Legacy node types cannot specify + custom EBS volumes. + For node types with no instance store, at least one EBS volume needs to be specified; + otherwise, cluster creation will fail. + + These EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc. + Instance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc. + + If EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for + scratch storage because heterogenously sized scratch devices can lead to inefficient disk + utilization. 
If no EBS volumes are attached, Databricks will configure Spark to use instance + store volumes. + + Please note that if EBS volumes are specified, then the Spark configuration `spark.local.dir` + will be overridden. + """ + + ebs_volume_iops: VariableOrOptional[int] + """ + If using gp3 volumes, what IOPS to use for the disk. If this is not set, the maximum performance of a gp2 volume with the same volume size will be used. + """ + + ebs_volume_size: VariableOrOptional[int] + """ + The size of each EBS volume (in GiB) launched for each instance. For general purpose + SSD, this value must be within the range 100 - 4096. For throughput optimized HDD, + this value must be within the range 500 - 4096. + """ + + ebs_volume_throughput: VariableOrOptional[int] + """ + If using gp3 volumes, what throughput to use for the disk. If this is not set, the maximum performance of a gp2 volume with the same volume size will be used. + """ + + ebs_volume_type: VariableOrOptional[EbsVolumeTypeParam] + + first_on_demand: VariableOrOptional[int] + """ + The first `first_on_demand` nodes of the cluster will be placed on on-demand instances. + If this value is greater than 0, the cluster driver node in particular will be placed on an + on-demand instance. If this value is greater than or equal to the current cluster size, all + nodes will be placed on on-demand instances. If this value is less than the current cluster + size, `first_on_demand` nodes will be placed on on-demand instances and the remainder will + be placed on `availability` instances. Note that this value does not affect + cluster size and cannot currently be mutated over the lifetime of a cluster. + """ + + instance_profile_arn: VariableOrOptional[str] + """ + Nodes for this cluster will only be placed on AWS instances with this instance profile. If + ommitted, nodes will be placed on instances without an IAM instance profile. The instance + profile must have previously been added to the Databricks environment by an account + administrator. + + This feature may only be available to certain customer plans. + """ + + spot_bid_price_percent: VariableOrOptional[int] + """ + The bid price for AWS spot instances, as a percentage of the corresponding instance type's + on-demand price. + For example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot + instance, then the bid price is half of the price of + on-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice + the price of on-demand `r3.xlarge` instances. If not specified, the default value is 100. + When spot instances are requested for this cluster, only spot instances whose bid price + percentage matches this field will be considered. + Note that, for safety, we enforce this field to be no more than 10000. + """ + + zone_id: VariableOrOptional[str] + """ + Identifier for the availability zone/datacenter in which the cluster resides. + This string will be of a form like "us-west-2a". The provided availability + zone must be in the same region as the Databricks deployment. For example, "us-west-2a" + is not a valid zone id if the Databricks deployment resides in the "us-east-1" region. + This is an optional field at cluster creation, and if not specified, a default zone will be used. + If the zone specified is "auto", will try to place cluster in a zone with high availability, + and will retry placement in a different AZ if there is not enough capacity. 
+ + The list of available zones as well as the default value can be found by using the + `List Zones` method. + """ + + +AwsAttributesParam = AwsAttributesDict | AwsAttributes diff --git a/experimental/python/databricks/bundles/pipelines/_models/aws_availability.py b/experimental/python/databricks/bundles/pipelines/_models/aws_availability.py new file mode 100644 index 0000000000..5d87ffafba --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/aws_availability.py @@ -0,0 +1,19 @@ +from enum import Enum +from typing import Literal + + +class AwsAvailability(Enum): + """ + Availability type used for all subsequent nodes past the `first_on_demand` ones. + + Note: If `first_on_demand` is zero, this availability type will be used for the entire cluster. + """ + + SPOT = "SPOT" + ON_DEMAND = "ON_DEMAND" + SPOT_WITH_FALLBACK = "SPOT_WITH_FALLBACK" + + +AwsAvailabilityParam = ( + Literal["SPOT", "ON_DEMAND", "SPOT_WITH_FALLBACK"] | AwsAvailability +) diff --git a/experimental/python/databricks/bundles/pipelines/_models/azure_attributes.py b/experimental/python/databricks/bundles/pipelines/_models/azure_attributes.py new file mode 100644 index 0000000000..f06f84e9c9 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/azure_attributes.py @@ -0,0 +1,90 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOrOptional +from databricks.bundles.pipelines._models.azure_availability import ( + AzureAvailability, + AzureAvailabilityParam, +) +from databricks.bundles.pipelines._models.log_analytics_info import ( + LogAnalyticsInfo, + LogAnalyticsInfoParam, +) + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class AzureAttributes: + """ + Attributes set during cluster creation which are related to Microsoft Azure. + """ + + availability: VariableOrOptional[AzureAvailability] = None + + first_on_demand: VariableOrOptional[int] = None + """ + The first `first_on_demand` nodes of the cluster will be placed on on-demand instances. + This value should be greater than 0, to make sure the cluster driver node is placed on an + on-demand instance. If this value is greater than or equal to the current cluster size, all + nodes will be placed on on-demand instances. If this value is less than the current cluster + size, `first_on_demand` nodes will be placed on on-demand instances and the remainder will + be placed on `availability` instances. Note that this value does not affect + cluster size and cannot currently be mutated over the lifetime of a cluster. + """ + + log_analytics_info: VariableOrOptional[LogAnalyticsInfo] = None + """ + Defines values necessary to configure and run Azure Log Analytics agent + """ + + spot_bid_max_price: VariableOrOptional[float] = None + """ + The max bid price to be used for Azure spot instances. + The Max price for the bid cannot be higher than the on-demand price of the instance. + If not specified, the default value is -1, which specifies that the instance cannot be evicted + on the basis of price, and only on the basis of availability. Further, the value should > 0 or -1. 
+ """ + + @classmethod + def from_dict(cls, value: "AzureAttributesDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "AzureAttributesDict": + return _transform_to_json_value(self) # type:ignore + + +class AzureAttributesDict(TypedDict, total=False): + """""" + + availability: VariableOrOptional[AzureAvailabilityParam] + + first_on_demand: VariableOrOptional[int] + """ + The first `first_on_demand` nodes of the cluster will be placed on on-demand instances. + This value should be greater than 0, to make sure the cluster driver node is placed on an + on-demand instance. If this value is greater than or equal to the current cluster size, all + nodes will be placed on on-demand instances. If this value is less than the current cluster + size, `first_on_demand` nodes will be placed on on-demand instances and the remainder will + be placed on `availability` instances. Note that this value does not affect + cluster size and cannot currently be mutated over the lifetime of a cluster. + """ + + log_analytics_info: VariableOrOptional[LogAnalyticsInfoParam] + """ + Defines values necessary to configure and run Azure Log Analytics agent + """ + + spot_bid_max_price: VariableOrOptional[float] + """ + The max bid price to be used for Azure spot instances. + The Max price for the bid cannot be higher than the on-demand price of the instance. + If not specified, the default value is -1, which specifies that the instance cannot be evicted + on the basis of price, and only on the basis of availability. Further, the value should > 0 or -1. + """ + + +AzureAttributesParam = AzureAttributesDict | AzureAttributes diff --git a/experimental/python/databricks/bundles/pipelines/_models/azure_availability.py b/experimental/python/databricks/bundles/pipelines/_models/azure_availability.py new file mode 100644 index 0000000000..72d461d5d7 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/azure_availability.py @@ -0,0 +1,19 @@ +from enum import Enum +from typing import Literal + + +class AzureAvailability(Enum): + """ + Availability type used for all subsequent nodes past the `first_on_demand` ones. + Note: If `first_on_demand` is zero, this availability type will be used for the entire cluster. 
+ """ + + SPOT_AZURE = "SPOT_AZURE" + ON_DEMAND_AZURE = "ON_DEMAND_AZURE" + SPOT_WITH_FALLBACK_AZURE = "SPOT_WITH_FALLBACK_AZURE" + + +AzureAvailabilityParam = ( + Literal["SPOT_AZURE", "ON_DEMAND_AZURE", "SPOT_WITH_FALLBACK_AZURE"] + | AzureAvailability +) diff --git a/experimental/python/databricks/bundles/pipelines/_models/cluster_log_conf.py b/experimental/python/databricks/bundles/pipelines/_models/cluster_log_conf.py new file mode 100644 index 0000000000..f0b4f98545 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/cluster_log_conf.py @@ -0,0 +1,82 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOrOptional +from databricks.bundles.pipelines._models.dbfs_storage_info import ( + DbfsStorageInfo, + DbfsStorageInfoParam, +) +from databricks.bundles.pipelines._models.s3_storage_info import ( + S3StorageInfo, + S3StorageInfoParam, +) +from databricks.bundles.pipelines._models.volumes_storage_info import ( + VolumesStorageInfo, + VolumesStorageInfoParam, +) + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class ClusterLogConf: + """ + Cluster log delivery config + """ + + dbfs: VariableOrOptional[DbfsStorageInfo] = None + """ + destination needs to be provided. e.g. + `{ "dbfs" : { "destination" : "dbfs:/home/cluster_log" } }` + """ + + s3: VariableOrOptional[S3StorageInfo] = None + """ + destination and either the region or endpoint need to be provided. e.g. + `{ "s3": { "destination" : "s3://cluster_log_bucket/prefix", "region" : "us-west-2" } }` + Cluster iam role is used to access s3, please make sure the cluster iam role in + `instance_profile_arn` has permission to write data to the s3 destination. + """ + + volumes: VariableOrOptional[VolumesStorageInfo] = None + """ + destination needs to be provided, e.g. + `{ "volumes": { "destination": "/Volumes/catalog/schema/volume/cluster_log" } }` + """ + + @classmethod + def from_dict(cls, value: "ClusterLogConfDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "ClusterLogConfDict": + return _transform_to_json_value(self) # type:ignore + + +class ClusterLogConfDict(TypedDict, total=False): + """""" + + dbfs: VariableOrOptional[DbfsStorageInfoParam] + """ + destination needs to be provided. e.g. + `{ "dbfs" : { "destination" : "dbfs:/home/cluster_log" } }` + """ + + s3: VariableOrOptional[S3StorageInfoParam] + """ + destination and either the region or endpoint need to be provided. e.g. + `{ "s3": { "destination" : "s3://cluster_log_bucket/prefix", "region" : "us-west-2" } }` + Cluster iam role is used to access s3, please make sure the cluster iam role in + `instance_profile_arn` has permission to write data to the s3 destination. + """ + + volumes: VariableOrOptional[VolumesStorageInfoParam] + """ + destination needs to be provided, e.g. 
+ `{ "volumes": { "destination": "/Volumes/catalog/schema/volume/cluster_log" } }` + """ + + +ClusterLogConfParam = ClusterLogConfDict | ClusterLogConf diff --git a/experimental/python/databricks/bundles/pipelines/_models/dbfs_storage_info.py b/experimental/python/databricks/bundles/pipelines/_models/dbfs_storage_info.py new file mode 100644 index 0000000000..81fe319a65 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/dbfs_storage_info.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOr + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class DbfsStorageInfo: + """ + A storage location in DBFS + """ + + destination: VariableOr[str] + """ + dbfs destination, e.g. `dbfs:/my/path` + """ + + @classmethod + def from_dict(cls, value: "DbfsStorageInfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "DbfsStorageInfoDict": + return _transform_to_json_value(self) # type:ignore + + +class DbfsStorageInfoDict(TypedDict, total=False): + """""" + + destination: VariableOr[str] + """ + dbfs destination, e.g. `dbfs:/my/path` + """ + + +DbfsStorageInfoParam = DbfsStorageInfoDict | DbfsStorageInfo diff --git a/experimental/python/databricks/bundles/pipelines/_models/ebs_volume_type.py b/experimental/python/databricks/bundles/pipelines/_models/ebs_volume_type.py new file mode 100644 index 0000000000..b67853f8cb --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/ebs_volume_type.py @@ -0,0 +1,17 @@ +from enum import Enum +from typing import Literal + + +class EbsVolumeType(Enum): + """ + All EBS volume types that Databricks supports. + See https://aws.amazon.com/ebs/details/ for details. + """ + + GENERAL_PURPOSE_SSD = "GENERAL_PURPOSE_SSD" + THROUGHPUT_OPTIMIZED_HDD = "THROUGHPUT_OPTIMIZED_HDD" + + +EbsVolumeTypeParam = ( + Literal["GENERAL_PURPOSE_SSD", "THROUGHPUT_OPTIMIZED_HDD"] | EbsVolumeType +) diff --git a/experimental/python/databricks/bundles/pipelines/_models/gcp_attributes.py b/experimental/python/databricks/bundles/pipelines/_models/gcp_attributes.py new file mode 100644 index 0000000000..1c94e0071f --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/gcp_attributes.py @@ -0,0 +1,100 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOrOptional +from databricks.bundles.pipelines._models.gcp_availability import ( + GcpAvailability, + GcpAvailabilityParam, +) + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class GcpAttributes: + """ + Attributes set during cluster creation which are related to GCP. + """ + + availability: VariableOrOptional[GcpAvailability] = None + + boot_disk_size: VariableOrOptional[int] = None + """ + Boot disk size in GB + """ + + google_service_account: VariableOrOptional[str] = None + """ + If provided, the cluster will impersonate the google service account when accessing + gcloud services (like GCS). The google service account + must have previously been added to the Databricks environment by an account + administrator. 
+ """ + + local_ssd_count: VariableOrOptional[int] = None + """ + If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. + Each local SSD is 375GB in size. + Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) + for the supported number of local SSDs for each instance type. + """ + + zone_id: VariableOrOptional[str] = None + """ + Identifier for the availability zone in which the cluster resides. + This can be one of the following: + - "HA" => High availability, spread nodes across availability zones for a Databricks deployment region [default]. + - "AUTO" => Databricks picks an availability zone to schedule the cluster on. + - A GCP availability zone => Pick One of the available zones for (machine type + region) from + https://cloud.google.com/compute/docs/regions-zones. + """ + + @classmethod + def from_dict(cls, value: "GcpAttributesDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "GcpAttributesDict": + return _transform_to_json_value(self) # type:ignore + + +class GcpAttributesDict(TypedDict, total=False): + """""" + + availability: VariableOrOptional[GcpAvailabilityParam] + + boot_disk_size: VariableOrOptional[int] + """ + Boot disk size in GB + """ + + google_service_account: VariableOrOptional[str] + """ + If provided, the cluster will impersonate the google service account when accessing + gcloud services (like GCS). The google service account + must have previously been added to the Databricks environment by an account + administrator. + """ + + local_ssd_count: VariableOrOptional[int] + """ + If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. + Each local SSD is 375GB in size. + Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) + for the supported number of local SSDs for each instance type. + """ + + zone_id: VariableOrOptional[str] + """ + Identifier for the availability zone in which the cluster resides. + This can be one of the following: + - "HA" => High availability, spread nodes across availability zones for a Databricks deployment region [default]. + - "AUTO" => Databricks picks an availability zone to schedule the cluster on. + - A GCP availability zone => Pick One of the available zones for (machine type + region) from + https://cloud.google.com/compute/docs/regions-zones. + """ + + +GcpAttributesParam = GcpAttributesDict | GcpAttributes diff --git a/experimental/python/databricks/bundles/pipelines/_models/gcp_availability.py b/experimental/python/databricks/bundles/pipelines/_models/gcp_availability.py new file mode 100644 index 0000000000..aa8e785a71 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/gcp_availability.py @@ -0,0 +1,19 @@ +from enum import Enum +from typing import Literal + + +class GcpAvailability(Enum): + """ + This field determines whether the instance pool will contain preemptible + VMs, on-demand VMs, or preemptible VMs with a fallback to on-demand VMs if the former is unavailable. 
+ """ + + PREEMPTIBLE_GCP = "PREEMPTIBLE_GCP" + ON_DEMAND_GCP = "ON_DEMAND_GCP" + PREEMPTIBLE_WITH_FALLBACK_GCP = "PREEMPTIBLE_WITH_FALLBACK_GCP" + + +GcpAvailabilityParam = ( + Literal["PREEMPTIBLE_GCP", "ON_DEMAND_GCP", "PREEMPTIBLE_WITH_FALLBACK_GCP"] + | GcpAvailability +) diff --git a/experimental/python/databricks/bundles/pipelines/_models/gcs_storage_info.py b/experimental/python/databricks/bundles/pipelines/_models/gcs_storage_info.py new file mode 100644 index 0000000000..a5e6d51e6e --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/gcs_storage_info.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOr + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class GcsStorageInfo: + """ + A storage location in Google Cloud Platform's GCS + """ + + destination: VariableOr[str] + """ + GCS destination/URI, e.g. `gs://my-bucket/some-prefix` + """ + + @classmethod + def from_dict(cls, value: "GcsStorageInfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "GcsStorageInfoDict": + return _transform_to_json_value(self) # type:ignore + + +class GcsStorageInfoDict(TypedDict, total=False): + """""" + + destination: VariableOr[str] + """ + GCS destination/URI, e.g. `gs://my-bucket/some-prefix` + """ + + +GcsStorageInfoParam = GcsStorageInfoDict | GcsStorageInfo diff --git a/experimental/python/databricks/bundles/pipelines/_models/ingestion_config.py b/experimental/python/databricks/bundles/pipelines/_models/ingestion_config.py index c452222df9..988227c43e 100644 --- a/experimental/python/databricks/bundles/pipelines/_models/ingestion_config.py +++ b/experimental/python/databricks/bundles/pipelines/_models/ingestion_config.py @@ -4,7 +4,10 @@ from databricks.bundles.core._transform import _transform from databricks.bundles.core._transform_to_json import _transform_to_json_value from databricks.bundles.core._variable import VariableOrOptional -from databricks.bundles.pipelines._models.report_spec import ReportSpec, ReportSpecParam +from databricks.bundles.pipelines._models.report_spec import ( + ReportSpec, + ReportSpecParam, +) from databricks.bundles.pipelines._models.schema_spec import SchemaSpec, SchemaSpecParam from databricks.bundles.pipelines._models.table_spec import TableSpec, TableSpecParam diff --git a/experimental/python/databricks/bundles/pipelines/_models/init_script_info.py b/experimental/python/databricks/bundles/pipelines/_models/init_script_info.py new file mode 100644 index 0000000000..91bc383e42 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/init_script_info.py @@ -0,0 +1,129 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOrOptional +from databricks.bundles.pipelines._models.adlsgen2_info import ( + Adlsgen2Info, + Adlsgen2InfoParam, +) +from databricks.bundles.pipelines._models.gcs_storage_info import ( + GcsStorageInfo, + GcsStorageInfoParam, +) +from databricks.bundles.pipelines._models.local_file_info import ( + LocalFileInfo, + LocalFileInfoParam, +) +from 
databricks.bundles.pipelines._models.s3_storage_info import ( + S3StorageInfo, + S3StorageInfoParam, +) +from databricks.bundles.pipelines._models.volumes_storage_info import ( + VolumesStorageInfo, + VolumesStorageInfoParam, +) +from databricks.bundles.pipelines._models.workspace_storage_info import ( + WorkspaceStorageInfo, + WorkspaceStorageInfoParam, +) + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class InitScriptInfo: + """ + Config for an individual init script + Next ID: 11 + """ + + abfss: VariableOrOptional[Adlsgen2Info] = None + """ + Contains the Azure Data Lake Storage destination path + """ + + file: VariableOrOptional[LocalFileInfo] = None + """ + destination needs to be provided, e.g. + `{ "file": { "destination": "file:/my/local/file.sh" } }` + """ + + gcs: VariableOrOptional[GcsStorageInfo] = None + """ + destination needs to be provided, e.g. + `{ "gcs": { "destination": "gs://my-bucket/file.sh" } }` + """ + + s3: VariableOrOptional[S3StorageInfo] = None + """ + destination and either the region or endpoint need to be provided. e.g. + `{ \"s3\": { \"destination\": \"s3://cluster_log_bucket/prefix\", \"region\": \"us-west-2\" } }` + Cluster iam role is used to access s3, please make sure the cluster iam role in + `instance_profile_arn` has permission to write data to the s3 destination. + """ + + volumes: VariableOrOptional[VolumesStorageInfo] = None + """ + destination needs to be provided. e.g. + `{ \"volumes\" : { \"destination\" : \"/Volumes/my-init.sh\" } }` + """ + + workspace: VariableOrOptional[WorkspaceStorageInfo] = None + """ + destination needs to be provided, e.g. + `{ "workspace": { "destination": "/cluster-init-scripts/setup-datadog.sh" } }` + """ + + @classmethod + def from_dict(cls, value: "InitScriptInfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "InitScriptInfoDict": + return _transform_to_json_value(self) # type:ignore + + +class InitScriptInfoDict(TypedDict, total=False): + """""" + + abfss: VariableOrOptional[Adlsgen2InfoParam] + """ + Contains the Azure Data Lake Storage destination path + """ + + file: VariableOrOptional[LocalFileInfoParam] + """ + destination needs to be provided, e.g. + `{ "file": { "destination": "file:/my/local/file.sh" } }` + """ + + gcs: VariableOrOptional[GcsStorageInfoParam] + """ + destination needs to be provided, e.g. + `{ "gcs": { "destination": "gs://my-bucket/file.sh" } }` + """ + + s3: VariableOrOptional[S3StorageInfoParam] + """ + destination and either the region or endpoint need to be provided. e.g. + `{ \"s3\": { \"destination\": \"s3://cluster_log_bucket/prefix\", \"region\": \"us-west-2\" } }` + Cluster iam role is used to access s3, please make sure the cluster iam role in + `instance_profile_arn` has permission to write data to the s3 destination. + """ + + volumes: VariableOrOptional[VolumesStorageInfoParam] + """ + destination needs to be provided. e.g. + `{ \"volumes\" : { \"destination\" : \"/Volumes/my-init.sh\" } }` + """ + + workspace: VariableOrOptional[WorkspaceStorageInfoParam] + """ + destination needs to be provided, e.g. 
+ `{ "workspace": { "destination": "/cluster-init-scripts/setup-datadog.sh" } }` + """ + + +InitScriptInfoParam = InitScriptInfoDict | InitScriptInfo diff --git a/experimental/python/databricks/bundles/pipelines/_models/local_file_info.py b/experimental/python/databricks/bundles/pipelines/_models/local_file_info.py new file mode 100644 index 0000000000..70d6f25820 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/local_file_info.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOr + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class LocalFileInfo: + """""" + + destination: VariableOr[str] + """ + local file destination, e.g. `file:/my/local/file.sh` + """ + + @classmethod + def from_dict(cls, value: "LocalFileInfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "LocalFileInfoDict": + return _transform_to_json_value(self) # type:ignore + + +class LocalFileInfoDict(TypedDict, total=False): + """""" + + destination: VariableOr[str] + """ + local file destination, e.g. `file:/my/local/file.sh` + """ + + +LocalFileInfoParam = LocalFileInfoDict | LocalFileInfo diff --git a/experimental/python/databricks/bundles/pipelines/_models/log_analytics_info.py b/experimental/python/databricks/bundles/pipelines/_models/log_analytics_info.py new file mode 100644 index 0000000000..5eced870a1 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/log_analytics_info.py @@ -0,0 +1,48 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOrOptional + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class LogAnalyticsInfo: + """""" + + log_analytics_primary_key: VariableOrOptional[str] = None + """ + The primary key for the Azure Log Analytics agent configuration + """ + + log_analytics_workspace_id: VariableOrOptional[str] = None + """ + The workspace ID for the Azure Log Analytics agent configuration + """ + + @classmethod + def from_dict(cls, value: "LogAnalyticsInfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "LogAnalyticsInfoDict": + return _transform_to_json_value(self) # type:ignore + + +class LogAnalyticsInfoDict(TypedDict, total=False): + """""" + + log_analytics_primary_key: VariableOrOptional[str] + """ + The primary key for the Azure Log Analytics agent configuration + """ + + log_analytics_workspace_id: VariableOrOptional[str] + """ + The workspace ID for the Azure Log Analytics agent configuration + """ + + +LogAnalyticsInfoParam = LogAnalyticsInfoDict | LogAnalyticsInfo diff --git a/experimental/python/databricks/bundles/pipelines/_models/maven_library.py b/experimental/python/databricks/bundles/pipelines/_models/maven_library.py new file mode 100644 index 0000000000..45925700b8 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/maven_library.py @@ -0,0 +1,70 @@ +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from 
databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import ( + VariableOr, + VariableOrList, + VariableOrOptional, +) + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class MavenLibrary: + """""" + + coordinates: VariableOr[str] + """ + Gradle-style maven coordinates. For example: "org.jsoup:jsoup:1.7.2". + """ + + exclusions: VariableOrList[str] = field(default_factory=list) + """ + List of dependencies to exclude. For example: `["slf4j:slf4j", "*:hadoop-client"]`. + + Maven dependency exclusions: + https://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html. + """ + + repo: VariableOrOptional[str] = None + """ + Maven repo to install the Maven package from. If omitted, both Maven Central Repository + and Spark Packages are searched. + """ + + @classmethod + def from_dict(cls, value: "MavenLibraryDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "MavenLibraryDict": + return _transform_to_json_value(self) # type:ignore + + +class MavenLibraryDict(TypedDict, total=False): + """""" + + coordinates: VariableOr[str] + """ + Gradle-style maven coordinates. For example: "org.jsoup:jsoup:1.7.2". + """ + + exclusions: VariableOrList[str] + """ + List of dependencies to exclude. For example: `["slf4j:slf4j", "*:hadoop-client"]`. + + Maven dependency exclusions: + https://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html. + """ + + repo: VariableOrOptional[str] + """ + Maven repo to install the Maven package from. If omitted, both Maven Central Repository + and Spark Packages are searched. + """ + + +MavenLibraryParam = MavenLibraryDict | MavenLibrary diff --git a/experimental/python/databricks/bundles/pipelines/_models/pipeline_cluster.py b/experimental/python/databricks/bundles/pipelines/_models/pipeline_cluster.py index 70de76b20d..01563f6a85 100644 --- a/experimental/python/databricks/bundles/pipelines/_models/pipeline_cluster.py +++ b/experimental/python/databricks/bundles/pipelines/_models/pipeline_cluster.py @@ -1,33 +1,33 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, TypedDict -from databricks.bundles.compute._models.aws_attributes import ( +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import ( + VariableOrDict, + VariableOrList, + VariableOrOptional, +) +from databricks.bundles.pipelines._models.aws_attributes import ( AwsAttributes, AwsAttributesParam, ) -from databricks.bundles.compute._models.azure_attributes import ( +from databricks.bundles.pipelines._models.azure_attributes import ( AzureAttributes, AzureAttributesParam, ) -from databricks.bundles.compute._models.cluster_log_conf import ( +from databricks.bundles.pipelines._models.cluster_log_conf import ( ClusterLogConf, ClusterLogConfParam, ) -from databricks.bundles.compute._models.gcp_attributes import ( +from databricks.bundles.pipelines._models.gcp_attributes import ( GcpAttributes, GcpAttributesParam, ) -from databricks.bundles.compute._models.init_script_info import ( +from databricks.bundles.pipelines._models.init_script_info import ( InitScriptInfo, InitScriptInfoParam, ) -from databricks.bundles.core._transform import _transform -from databricks.bundles.core._transform_to_json import _transform_to_json_value -from databricks.bundles.core._variable import 
( - VariableOrDict, - VariableOrList, - VariableOrOptional, -) from databricks.bundles.pipelines._models.pipeline_cluster_autoscale import ( PipelineClusterAutoscale, PipelineClusterAutoscaleParam, diff --git a/experimental/python/databricks/bundles/pipelines/_models/pipeline_library.py b/experimental/python/databricks/bundles/pipelines/_models/pipeline_library.py index 459cf3bd8b..fa5b9e09e1 100644 --- a/experimental/python/databricks/bundles/pipelines/_models/pipeline_library.py +++ b/experimental/python/databricks/bundles/pipelines/_models/pipeline_library.py @@ -1,10 +1,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, TypedDict -from databricks.bundles.compute._models.maven_library import ( - MavenLibrary, - MavenLibraryParam, -) from databricks.bundles.core._transform import _transform from databricks.bundles.core._transform_to_json import _transform_to_json_value from databricks.bundles.core._variable import VariableOrOptional @@ -12,6 +8,10 @@ FileLibrary, FileLibraryParam, ) +from databricks.bundles.pipelines._models.maven_library import ( + MavenLibrary, + MavenLibraryParam, +) from databricks.bundles.pipelines._models.notebook_library import ( NotebookLibrary, NotebookLibraryParam, diff --git a/experimental/python/databricks/bundles/pipelines/_models/s3_storage_info.py b/experimental/python/databricks/bundles/pipelines/_models/s3_storage_info.py new file mode 100644 index 0000000000..b5e09063e5 --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/s3_storage_info.py @@ -0,0 +1,122 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOr, VariableOrOptional + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class S3StorageInfo: + """ + A storage location in Amazon S3 + """ + + destination: VariableOr[str] + """ + S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using + cluster iam role, please make sure you set cluster iam role and the role has write access to the + destination. Please also note that you cannot use AWS keys to deliver logs. + """ + + canned_acl: VariableOrOptional[str] = None + """ + (Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`. + If `canned_acl` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on + the destination bucket and prefix. The full list of possible canned acl can be found at + http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl. + Please also note that by default only the object owner gets full control. If you are using cross account + role for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to + read the logs. + """ + + enable_encryption: VariableOrOptional[bool] = None + """ + (Optional) Flag to enable server side encryption, `false` by default. + """ + + encryption_type: VariableOrOptional[str] = None + """ + (Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when + encryption is enabled and the default type is `sse-s3`. + """ + + endpoint: VariableOrOptional[str] = None + """ + S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set. + If both are set, endpoint will be used. 
+ """ + + kms_key: VariableOrOptional[str] = None + """ + (Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`. + """ + + region: VariableOrOptional[str] = None + """ + S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set, + endpoint will be used. + """ + + @classmethod + def from_dict(cls, value: "S3StorageInfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "S3StorageInfoDict": + return _transform_to_json_value(self) # type:ignore + + +class S3StorageInfoDict(TypedDict, total=False): + """""" + + destination: VariableOr[str] + """ + S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using + cluster iam role, please make sure you set cluster iam role and the role has write access to the + destination. Please also note that you cannot use AWS keys to deliver logs. + """ + + canned_acl: VariableOrOptional[str] + """ + (Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`. + If `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on + the destination bucket and prefix. The full list of possible canned acl can be found at + http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl. + Please also note that by default only the object owner gets full controls. If you are using cross account + role for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to + read the logs. + """ + + enable_encryption: VariableOrOptional[bool] + """ + (Optional) Flag to enable server side encryption, `false` by default. + """ + + encryption_type: VariableOrOptional[str] + """ + (Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when + encryption is enabled and the default type is `sse-s3`. + """ + + endpoint: VariableOrOptional[str] + """ + S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set. + If both are set, endpoint will be used. + """ + + kms_key: VariableOrOptional[str] + """ + (Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`. + """ + + region: VariableOrOptional[str] + """ + S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set, + endpoint will be used. + """ + + +S3StorageInfoParam = S3StorageInfoDict | S3StorageInfo diff --git a/experimental/python/databricks/bundles/pipelines/_models/volumes_storage_info.py b/experimental/python/databricks/bundles/pipelines/_models/volumes_storage_info.py new file mode 100644 index 0000000000..cbe74758fc --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/volumes_storage_info.py @@ -0,0 +1,42 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOr + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class VolumesStorageInfo: + """ + A storage location back by UC Volumes. + """ + + destination: VariableOr[str] + """ + UC Volumes destination, e.g. 
`/Volumes/catalog/schema/vol1/init-scripts/setup-datadog.sh` + or `dbfs:/Volumes/catalog/schema/vol1/init-scripts/setup-datadog.sh` + """ + + @classmethod + def from_dict(cls, value: "VolumesStorageInfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "VolumesStorageInfoDict": + return _transform_to_json_value(self) # type:ignore + + +class VolumesStorageInfoDict(TypedDict, total=False): + """""" + + destination: VariableOr[str] + """ + UC Volumes destination, e.g. `/Volumes/catalog/schema/vol1/init-scripts/setup-datadog.sh` + or `dbfs:/Volumes/catalog/schema/vol1/init-scripts/setup-datadog.sh` + """ + + +VolumesStorageInfoParam = VolumesStorageInfoDict | VolumesStorageInfo diff --git a/experimental/python/databricks/bundles/pipelines/_models/workspace_storage_info.py b/experimental/python/databricks/bundles/pipelines/_models/workspace_storage_info.py new file mode 100644 index 0000000000..29c075ac2f --- /dev/null +++ b/experimental/python/databricks/bundles/pipelines/_models/workspace_storage_info.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypedDict + +from databricks.bundles.core._transform import _transform +from databricks.bundles.core._transform_to_json import _transform_to_json_value +from databricks.bundles.core._variable import VariableOr + +if TYPE_CHECKING: + from typing_extensions import Self + + +@dataclass(kw_only=True) +class WorkspaceStorageInfo: + """ + A storage location in Workspace Filesystem (WSFS) + """ + + destination: VariableOr[str] + """ + wsfs destination, e.g. `workspace:/cluster-init-scripts/setup-datadog.sh` + """ + + @classmethod + def from_dict(cls, value: "WorkspaceStorageInfoDict") -> "Self": + return _transform(cls, value) + + def as_dict(self) -> "WorkspaceStorageInfoDict": + return _transform_to_json_value(self) # type:ignore + + +class WorkspaceStorageInfoDict(TypedDict, total=False): + """""" + + destination: VariableOr[str] + """ + wsfs destination, e.g. 
`workspace:/cluster-init-scripts/setup-datadog.sh` + """ + + +WorkspaceStorageInfoParam = WorkspaceStorageInfoDict | WorkspaceStorageInfo From db12e9f84a84f4a96376c4c264475cfea2916848 Mon Sep 17 00:00:00 2001 From: Gleb Kanterov Date: Fri, 15 Aug 2025 10:59:43 +0200 Subject: [PATCH 2/3] Add changelog --- NEXT_CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index d7997c825d..4179612fc8 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -3,6 +3,7 @@ ## Release v0.265.0 ### Notable Changes +* Separate generated classes between jobs and pipelines in Python support ([#3428](https://github.com/databricks/cli/pull/3428)) ### Dependency updates * Upgrade TF provider to 1.87.0 ([#3430](https://github.com/databricks/cli/pull/3430)) From e57f1c91b72cafb25dad44f80cd4a31007dfaa5d Mon Sep 17 00:00:00 2001 From: Gleb Kanterov Date: Mon, 18 Aug 2025 18:08:32 +0200 Subject: [PATCH 3/3] Update generated code --- .../pipelines/_models/gcp_attributes.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/experimental/python/databricks/bundles/pipelines/_models/gcp_attributes.py b/experimental/python/databricks/bundles/pipelines/_models/gcp_attributes.py index 1c94e0071f..53b2e9c76d 100644 --- a/experimental/python/databricks/bundles/pipelines/_models/gcp_attributes.py +++ b/experimental/python/databricks/bundles/pipelines/_models/gcp_attributes.py @@ -26,6 +26,17 @@ class GcpAttributes: Boot disk size in GB """ + first_on_demand: VariableOrOptional[int] = None + """ + The first `first_on_demand` nodes of the cluster will be placed on on-demand instances. + This value should be greater than 0, to make sure the cluster driver node is placed on an + on-demand instance. If this value is greater than or equal to the current cluster size, all + nodes will be placed on on-demand instances. If this value is less than the current cluster + size, `first_on_demand` nodes will be placed on on-demand instances and the remainder will + be placed on `availability` instances. Note that this value does not affect + cluster size and cannot currently be mutated over the lifetime of a cluster. + """ + google_service_account: VariableOrOptional[str] = None """ If provided, the cluster will impersonate the google service account when accessing @@ -70,6 +81,17 @@ class GcpAttributesDict(TypedDict, total=False): Boot disk size in GB """ + first_on_demand: VariableOrOptional[int] + """ + The first `first_on_demand` nodes of the cluster will be placed on on-demand instances. + This value should be greater than 0, to make sure the cluster driver node is placed on an + on-demand instance. If this value is greater than or equal to the current cluster size, all + nodes will be placed on on-demand instances. If this value is less than the current cluster + size, `first_on_demand` nodes will be placed on on-demand instances and the remainder will + be placed on `availability` instances. Note that this value does not affect + cluster size and cannot currently be mutated over the lifetime of a cluster. + """ + google_service_account: VariableOrOptional[str] """ If provided, the cluster will impersonate the google service account when accessing