@@ -30,82 +30,6 @@
30 | 30 | from google.cloud import dataproc_v1 as dataproc |
31 | 31 | from google.cloud import storage |
32 | 32 |
33 | | - |
34 | | -def submit_job(project_id: str, region: str, cluster_name: str) -> None: |
35 | | - """Submits a Spark job to the specified Dataproc cluster with a driver node group and prints the output. |
36 | | -
37 | | - Args: |
38 | | - project_id: The Google Cloud project ID. |
39 | | - region: The Dataproc region where the cluster is located. |
40 | | - cluster_name: The name of the Dataproc cluster. |
41 | | - """ |
42 | | - # Create the job client. |
43 | | - with dataproc.JobControllerClient( |
44 | | - client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"} |
45 | | - ) as job_client: |
46 | | - |
47 | | - driver_scheduling_config = dataproc.DriverSchedulingConfig( |
48 | | - memory_mb=2048, # Example memory in MB |
49 | | - vcores=2, # Example number of vcores |
50 | | - ) |
51 | | - |
52 | | - # Create the job config. 'main_jar_file_uri' can also be a |
53 | | - # Google Cloud Storage URL. |
54 | | - job = { |
55 | | - "placement": {"cluster_name": cluster_name}, |
56 | | - "spark_job": { |
57 | | - "main_class": "org.apache.spark.examples.SparkPi", |
58 | | - "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"], |
59 | | - "args": ["1000"], |
60 | | - }, |
61 | | - "driver_scheduling_config": driver_scheduling_config |
62 | | - } |
63 | | - |
64 | | - operation = job_client.submit_job_as_operation( |
65 | | - request={"project_id": project_id, "region": region, "job": job} |
66 | | - ) |
67 | | - |
68 | | - response = operation.result() |
69 | | - |
70 | | - # Dataproc job output gets saved to the Cloud Storage bucket |
71 | | - # allocated to the job. Use a regex to obtain the bucket and blob info. |
72 | | - matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri) |
73 | | - if not matches: |
74 | | - print(f"Error: Could not parse driver output URI: {response.driver_output_resource_uri}") |
75 | | - raise ValueError |
76 | | - |
77 | | - #!/usr/bin/env python |
78 | | - |
79 | | -# Copyright 2025 Google LLC |
80 | | -# |
81 | | -# Licensed under the Apache License, Version 2.0 (the "License"); |
82 | | -# you may not use this file except in compliance with the License. |
83 | | -# You may obtain a copy of the License at |
84 | | -# |
85 | | -# http://www.apache.org/licenses/LICENSE-2.0 |
86 | | -# |
87 | | -# Unless required by applicable law or agreed to in writing, software |
88 | | -# distributed under the License is distributed on an "AS IS" BASIS, |
89 | | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
90 | | -# See the License for the specific language governing permissions and |
91 | | -# limitations under the License. |
92 | | - |
93 | | -# This sample walks a user through submitting a Spark job to a |
94 | | -# Dataproc driver node group cluster using the Dataproc |
95 | | -# client library. |
96 | | - |
97 | | -# Usage: |
98 | | -# python submit_spark_job_to_driver_node_group_cluster.py \ |
99 | | -# --project_id <PROJECT_ID> --region <REGION> \ |
100 | | -# --cluster_name <CLUSTER_NAME> |
101 | | - |
102 | | -# [START dataproc_submit_spark_job_to_driver_node_group_cluster] |
103 | | - |
104 | | -import re |
105 | | - |
106 | | -from google.cloud import dataproc_v1 as dataproc |
107 | | -from google.cloud import storage |
108 | | - |
109 | 33 | def submit_job(project_id: str, region: str, cluster_name: str) -> None: |
110 | 34 | """Submits a Spark job to the specified Dataproc cluster with a driver node group and prints the output. |
111 | 35 |
@@ -162,23 +86,6 @@ def submit_job(project_id: str, region: str, cluster_name: str) -> None: |
162 | 86 | # [END dataproc_submit_spark_job_to_driver_node_group_cluster] |
163 | 87 |
164 | 88 |
165 | | -if __name__ == "__main__": |
166 | | - import argparse |
167 | | - |
168 | | - parser = argparse.ArgumentParser( |
169 | | - description="Submits a Spark job to a Dataproc driver node group cluster." |
170 | | - ) |
171 | | - parser.add_argument("--project_id", help="The Google Cloud project ID.", required=True) |
172 | | - parser.add_argument("--region", help="The Dataproc region where the cluster is located.", required=True) |
173 | | - parser.add_argument("--cluster_name", help="The name of the Dataproc cluster.", required=True) |
174 | | - |
175 | | - args = parser.parse_args() |
176 | | - submit_job(args.project_id, args.region, args.cluster_name) |
177 | | - |
178 | | - |
179 | | -# [END dataproc_submit_spark_job_to_driver_node_group_cluster] |
180 | | - |
181 | | - |
182 | 89 | if __name__ == "__main__": |
183 | 90 | import argparse |
184 | 91 |
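In the retained copy of `submit_job`, the regex parse of `response.driver_output_resource_uri` (visible at the end of the first hunk) is followed in the full sample by a Cloud Storage read of the driver output. A minimal self-contained sketch of that step; the helper name `print_driver_output` is hypothetical, and the `.000000000` first-part-file suffix is an assumption based on the standard Dataproc sample pattern:

```python
import re

from google.cloud import storage


def print_driver_output(driver_output_resource_uri: str) -> None:
    """Downloads and prints the Spark driver output saved in Cloud Storage.

    Hypothetical helper; `driver_output_resource_uri` comes from the
    submit_job_as_operation() result shown in the diff above.
    """
    matches = re.match("gs://(.*?)/(.*)", driver_output_resource_uri)
    if not matches:
        raise ValueError(
            f"Could not parse driver output URI: {driver_output_resource_uri}"
        )

    # Dataproc writes driver output to the job's staging bucket as numbered
    # part files; ".000000000" is the first part (assumed sample convention).
    output = (
        storage.Client()
        .get_bucket(matches.group(1))
        .blob(f"{matches.group(2)}.000000000")
        .download_as_bytes()
        .decode("utf-8")
    )
    print(f"Job finished successfully: {output}")
```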
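For quick experiments, the deduplicated module can also be imported and driven directly, bypassing the argparse entry point that the diff retains. A minimal sketch; the project, region, and cluster values are placeholders, not values from this PR:

```python
from submit_spark_job_to_driver_node_group_cluster import submit_job

# Placeholder values; substitute a real project, region, and a cluster
# that was created with a driver node group.
submit_job(
    project_id="example-project",
    region="us-central1",
    cluster_name="example-driver-node-group-cluster",
)
```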