From dcb6f32c6598d54de1692218318e2ae5f1f805db Mon Sep 17 00:00:00 2001 From: dervoeti Date: Mon, 4 Aug 2025 12:13:01 +0200 Subject: [PATCH 01/26] CRD update --- deploy/helm/trino-operator/crds/crds.yaml | 316 ++++++++++++++++++++++ 1 file changed, 316 insertions(+) diff --git a/deploy/helm/trino-operator/crds/crds.yaml b/deploy/helm/trino-operator/crds/crds.yaml index f52c41b2..d0b76424 100644 --- a/deploy/helm/trino-operator/crds/crds.yaml +++ b/deploy/helm/trino-operator/crds/crds.yaml @@ -105,6 +105,322 @@ spec: description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object + faultTolerantExecution: + description: Fault tolerant execution configuration. When enabled, Trino can automatically retry queries or tasks in case of failures. + nullable: true + properties: + exchangeDeduplicationBufferSize: + description: Data size of the coordinator's in-memory buffer used to store output of query stages. + nullable: true + type: string + exchangeEncryptionEnabled: + description: Whether to enable encryption of spooling data. + nullable: true + type: boolean + exchangeManager: + description: Exchange manager configuration for spooling intermediate data during fault tolerant execution. Required when using `TASK` retry policy, optional for `QUERY` retry policy. + nullable: true + oneOf: + - required: + - s3 + - required: + - azure + - required: + - hdfs + - required: + - local + properties: + azure: + description: Azure Blob Storage configuration. + properties: + baseDirectories: + description: Azure Blob Storage container URIs for spooling data. + items: + type: string + type: array + blockSize: + description: Block data size for Azure block blob parallel upload. 
+ nullable: true + type: string + endpoint: + description: Azure blob endpoint URL (optional, used instead of connection string). + nullable: true + type: string + maxErrorRetries: + description: Maximum number of times the Azure client should retry a request. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + secretClass: + description: Secret class containing the Azure `connectionString`. + type: string + required: + - baseDirectories + - secretClass + type: object + hdfs: + description: HDFS-based exchange manager. + properties: + baseDirectories: + description: HDFS URIs for spooling data. + items: + type: string + type: array + blockSize: + description: Block data size for HDFS storage. + nullable: true + type: string + hdfs: + description: HDFS connection configuration. + properties: + configMap: + description: Name of the [discovery ConfigMap](https://docs.stackable.tech/home/nightly/concepts/service_discovery) providing information about the HDFS cluster. + type: string + required: + - configMap + type: object + skipDirectorySchemeValidation: + description: Skip directory scheme validation to support Hadoop-compatible file systems. + nullable: true + type: boolean + required: + - baseDirectories + - hdfs + type: object + local: + description: Local filesystem storage (not recommended for production). + properties: + baseDirectories: + description: Local filesystem paths for exchange storage. + items: + type: string + type: array + required: + - baseDirectories + type: object + s3: + description: S3-compatible storage configuration (includes AWS S3, MinIO, GCS). + properties: + baseDirectories: + description: S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). For GCS, use gs:// URIs (e.g., gs://bucket1,gs://bucket2). + items: + type: string + type: array + connection: + description: S3 connection configuration. Learn more about S3 configuration in the [S3 concept docs](https://docs.stackable.tech/home/nightly/concepts/s3). 
+ oneOf: + - required: + - inline + - required: + - reference + properties: + inline: + description: S3 connection definition as a resource. Learn more on the [S3 concept documentation](https://docs.stackable.tech/home/nightly/concepts/s3). + properties: + accessStyle: + default: VirtualHosted + description: Which access style to use. Defaults to virtual hosted-style as most of the data products out there. Have a look at the [AWS documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html). + enum: + - Path + - VirtualHosted + type: string + credentials: + description: If the S3 uses authentication you have to specify you S3 credentials. In the most cases a [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) providing `accessKey` and `secretKey` is sufficient. + nullable: true + properties: + scope: + description: '[Scope](https://docs.stackable.tech/home/nightly/secret-operator/scope) of the [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass).' + nullable: true + properties: + listenerVolumes: + default: [] + description: The listener volume scope allows Node and Service scopes to be inferred from the applicable listeners. This must correspond to Volume names in the Pod that mount Listeners. + items: + type: string + type: array + node: + default: false + description: The node scope is resolved to the name of the Kubernetes Node object that the Pod is running on. This will typically be the DNS name of the node. + type: boolean + pod: + default: false + description: The pod scope is resolved to the name of the Kubernetes Pod. This allows the secret to differentiate between StatefulSet replicas. + type: boolean + services: + default: [] + description: The service scope allows Pod objects to specify custom scopes. This should typically correspond to Service objects that the Pod participates in. 
+ items: + type: string + type: array + type: object + secretClass: + description: '[SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) containing the LDAP bind credentials.' + type: string + required: + - secretClass + type: object + host: + description: 'Host of the S3 server without any protocol or port. For example: `west1.my-cloud.com`.' + type: string + port: + description: Port the S3 server listens on. If not specified the product will determine the port to use. + format: uint16 + minimum: 0.0 + nullable: true + type: integer + region: + default: + name: us-east-1 + description: |- + Bucket region used for signing headers (sigv4). + + This defaults to `us-east-1` which is compatible with other implementations such as Minio. + + WARNING: Some products use the Hadoop S3 implementation which falls back to us-east-2. + properties: + name: + default: us-east-1 + type: string + type: object + tls: + description: Use a TLS connection. If not specified no TLS will be used. + nullable: true + properties: + verification: + description: The verification method used to verify the certificates of the server and/or the client. + oneOf: + - required: + - none + - required: + - server + properties: + none: + description: Use TLS but don't verify certificates. + type: object + server: + description: Use TLS and a CA certificate to verify the server. + properties: + caCert: + description: CA cert to verify the server. + oneOf: + - required: + - webPki + - required: + - secretClass + properties: + secretClass: + description: Name of the [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) which will provide the CA certificate. Note that a SecretClass does not need to have a key but can also work with just a CA certificate, so if you got provided with a CA cert but don't have access to the key you can still use this method. 
+ type: string + webPki: + description: Use TLS and the CA certificates trusted by the common web browsers to verify the server. This can be useful when you e.g. use public AWS S3 or other public available services. + type: object + type: object + required: + - caCert + type: object + type: object + required: + - verification + type: object + required: + - host + type: object + reference: + type: string + type: object + externalId: + description: External ID for the IAM role trust policy. + nullable: true + type: string + gcsServiceAccountKey: + description: Google Cloud Storage service account key in JSON format. Required when using GCS (gs:// URIs). Should contain the JSON service account key. The operator will mount this as a file and configure `exchange.gcs.json-key-file-path`. + nullable: true + properties: + key: + description: Key name in the secret that contains the JSON service account key. + nullable: true + type: string + secretClass: + description: Secret class containing the GCS service account key. The secret should contain a key with the JSON service account key data. + type: string + required: + - secretClass + type: object + iamRole: + description: IAM role to assume for S3 access. + nullable: true + type: string + maxErrorRetries: + description: Maximum number of times the S3 client should retry a request. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + uploadPartSize: + description: Part data size for S3 multi-part upload. + nullable: true + type: string + required: + - baseDirectories + - connection + type: object + sinkBufferPoolMinSize: + description: The minimum buffer pool size for an exchange sink. The larger the buffer pool size, the larger the write parallelism and memory usage. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + sinkBuffersPerPartition: + description: The number of buffers per partition in the buffer pool. 
The larger the buffer pool size, the larger the write parallelism and memory usage. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + sinkMaxFileSize: + description: Max data size of files written by exchange sinks. + nullable: true + type: string + sourceConcurrentReaders: + description: Number of concurrent readers to read from spooling storage. The larger the number of concurrent readers, the larger the read parallelism and memory usage. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + type: object + queryRetryAttempts: + description: Maximum number of times Trino may attempt to retry a query before declaring it failed. Only applies to `QUERY` retry policy. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + retryDelayScaleFactor: + description: Factor by which retry delay is increased on each query or task failure. + nullable: true + type: string + retryInitialDelay: + description: Minimum time that a failed query or task must wait before it is retried. + nullable: true + type: string + retryMaxDelay: + description: Maximum time that a failed query or task must wait before it is retried. + nullable: true + type: string + retryPolicy: + description: The retry policy for fault tolerant execution. `QUERY` retries entire queries, `TASK` retries individual tasks. When set to `TASK`, an exchange manager must be configured. + enum: + - QUERY + - TASK + type: string + taskRetryAttemptsPerTask: + description: Maximum number of times Trino may attempt to retry a single task before declaring the query failed. Only applies to `TASK` retry policy. 
+ format: uint32 + minimum: 0.0 + nullable: true + type: integer + required: + - retryPolicy + type: object tls: default: internalSecretClass: tls From d3f9322692f5bcf7758cdae70db6822caddf18c7 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Tue, 5 Aug 2025 19:17:16 +0200 Subject: [PATCH 02/26] feat: fault tolerant execution --- .github/ISSUE_TEMPLATE/02-bug_report.yml | 2 +- deploy/helm/trino-operator/crds/crds.yaml | 10 +- rust/operator-binary/src/command.rs | 18 +- rust/operator-binary/src/controller.rs | 85 +- .../src/crd/fault_tolerant_execution.rs | 826 ++++++++++++++++++ rust/operator-binary/src/crd/mod.rs | 8 + 6 files changed, 938 insertions(+), 11 deletions(-) create mode 100644 rust/operator-binary/src/crd/fault_tolerant_execution.rs diff --git a/.github/ISSUE_TEMPLATE/02-bug_report.yml b/.github/ISSUE_TEMPLATE/02-bug_report.yml index 88f66efd..bdfbd730 100644 --- a/.github/ISSUE_TEMPLATE/02-bug_report.yml +++ b/.github/ISSUE_TEMPLATE/02-bug_report.yml @@ -16,7 +16,7 @@ body: attributes: label: Affected Trino version description: Which version of Trino do you see this bug in? -# +# - type: textarea attributes: label: Current and expected behavior diff --git a/deploy/helm/trino-operator/crds/crds.yaml b/deploy/helm/trino-operator/crds/crds.yaml index d0b76424..aa8dfd02 100644 --- a/deploy/helm/trino-operator/crds/crds.yaml +++ b/deploy/helm/trino-operator/crds/crds.yaml @@ -146,6 +146,10 @@ spec: description: Azure blob endpoint URL (optional, used instead of connection string). nullable: true type: string + key: + description: Key name in the Secret that contains the connection string. + nullable: true + type: string maxErrorRetries: description: Maximum number of times the Azure client should retry a request. format: uint32 @@ -153,7 +157,7 @@ spec: nullable: true type: integer secretClass: - description: Secret class containing the Azure `connectionString`. 
+ description: '[SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) providing the Azure connection string.' type: string required: - baseDirectories @@ -338,11 +342,11 @@ spec: nullable: true properties: key: - description: Key name in the secret that contains the JSON service account key. + description: Key name in the Secret that contains the JSON service account key. nullable: true type: string secretClass: - description: Secret class containing the GCS service account key. The secret should contain a key with the JSON service account key data. + description: '[SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) providing the GCS service account key.' type: string required: - secretClass diff --git a/rust/operator-binary/src/command.rs b/rust/operator-binary/src/command.rs index 6c80f5e9..4b6791d5 100644 --- a/rust/operator-binary/src/command.rs +++ b/rust/operator-binary/src/command.rs @@ -14,7 +14,8 @@ use crate::{ CONFIG_DIR_NAME, Container, LOG_PROPERTIES, RW_CONFIG_DIR_NAME, STACKABLE_CLIENT_TLS_DIR, STACKABLE_INTERNAL_TLS_DIR, STACKABLE_MOUNT_INTERNAL_TLS_DIR, STACKABLE_MOUNT_SERVER_TLS_DIR, STACKABLE_SERVER_TLS_DIR, STACKABLE_TLS_STORE_PASSWORD, - SYSTEM_TRUST_STORE, SYSTEM_TRUST_STORE_PASSWORD, TrinoRole, v1alpha1, + SYSTEM_TRUST_STORE, SYSTEM_TRUST_STORE_PASSWORD, TrinoRole, + fault_tolerant_execution::ResolvedFaultTolerantExecutionConfig, v1alpha1, }, }; @@ -22,6 +23,7 @@ pub fn container_prepare_args( trino: &v1alpha1::TrinoCluster, catalogs: &[CatalogConfig], merged_config: &v1alpha1::TrinoConfig, + resolved_fte_config: &Option, ) -> Vec { let mut args = vec![]; @@ -78,12 +80,18 @@ pub fn container_prepare_args( args.extend_from_slice(&catalog.init_container_extra_start_commands); }); + // Add the commands that are needed for fault tolerant execution (e.g., TLS certificates for S3) + if let Some(resolved_fte) = resolved_fte_config { + 
args.extend_from_slice(&resolved_fte.init_container_extra_start_commands); + } + args } pub fn container_trino_args( authentication_config: &TrinoAuthenticationConfig, catalogs: &[CatalogConfig], + resolved_fte_config: &Option, ) -> Vec { let mut args = vec![ // copy config files to a writeable empty folder @@ -110,6 +118,14 @@ pub fn container_trino_args( args.push(format!("export {env_name}=\"$(cat {file})\"")); } }); + + // Add fault tolerant execution environment variables from files + if let Some(resolved_fte) = resolved_fte_config { + for (env_name, file) in &resolved_fte.load_env_from_files { + args.push(format!("export {env_name}=\"$(cat {file})\"")); + } + } + args.push("set -x".to_string()); // Start command diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index 236cb20f..c30b1b3b 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -78,14 +78,16 @@ use crate::{ command, config, crd::{ ACCESS_CONTROL_PROPERTIES, APP_NAME, CONFIG_DIR_NAME, CONFIG_PROPERTIES, Container, - DISCOVERY_URI, ENV_INTERNAL_SECRET, HTTP_PORT, HTTP_PORT_NAME, HTTPS_PORT, HTTPS_PORT_NAME, - JVM_CONFIG, JVM_SECURITY_PROPERTIES, LOG_PROPERTIES, MAX_TRINO_LOG_FILES_SIZE, - METRICS_PORT, METRICS_PORT_NAME, NODE_PROPERTIES, RW_CONFIG_DIR_NAME, - STACKABLE_CLIENT_TLS_DIR, STACKABLE_INTERNAL_TLS_DIR, STACKABLE_MOUNT_INTERNAL_TLS_DIR, - STACKABLE_MOUNT_SERVER_TLS_DIR, STACKABLE_SERVER_TLS_DIR, TrinoRole, + DISCOVERY_URI, ENV_INTERNAL_SECRET, EXCHANGE_MANAGER_PROPERTIES, HTTP_PORT, HTTP_PORT_NAME, + HTTPS_PORT, HTTPS_PORT_NAME, JVM_CONFIG, JVM_SECURITY_PROPERTIES, LOG_PROPERTIES, + MAX_TRINO_LOG_FILES_SIZE, METRICS_PORT, METRICS_PORT_NAME, NODE_PROPERTIES, + RW_CONFIG_DIR_NAME, STACKABLE_CLIENT_TLS_DIR, STACKABLE_INTERNAL_TLS_DIR, + STACKABLE_MOUNT_INTERNAL_TLS_DIR, STACKABLE_MOUNT_SERVER_TLS_DIR, STACKABLE_SERVER_TLS_DIR, + TrinoRole, authentication::resolve_authentication_classes, catalog, 
discovery::{TrinoDiscovery, TrinoDiscoveryProtocol, TrinoPodRef}, + fault_tolerant_execution::ResolvedFaultTolerantExecutionConfig, rolegroup_headless_service_name, v1alpha1, }, listener::{ @@ -298,6 +300,11 @@ pub enum Error { source: crate::operations::graceful_shutdown::Error, }, + #[snafu(display("failed to configure fault tolerant execution"))] + FaultTolerantExecution { + source: crate::crd::fault_tolerant_execution::Error, + }, + #[snafu(display("failed to get required Labels"))] GetRequiredLabels { source: @@ -424,6 +431,20 @@ pub async fn reconcile_trino( catalogs.push(catalog_config); } + // Resolve fault tolerant execution configuration with S3 connections if needed + let resolved_fte_config = match trino.spec.cluster_config.fault_tolerant_execution.as_ref() { + Some(fte_config) => Some( + ResolvedFaultTolerantExecutionConfig::from_config( + fte_config, + Some(client), + &trino.namespace_r().context(ReadRoleSnafu)?, + ) + .await + .context(FaultTolerantExecutionSnafu)?, + ), + None => None, + }; + let validated_config = validated_product_config( trino, // The Trino version is a single number like 396. 
@@ -526,6 +547,7 @@ pub async fn reconcile_trino( &trino_authentication_config, &trino_opa_config, &client.kubernetes_cluster_info, + &resolved_fte_config, )?; let rg_catalog_configmap = build_rolegroup_catalog_config_map( trino, @@ -543,6 +565,7 @@ pub async fn reconcile_trino( &trino_authentication_config, &catalogs, &rbac_sa.name_any(), + &resolved_fte_config, )?; cluster_resources @@ -651,6 +674,7 @@ fn build_rolegroup_config_map( trino_authentication_config: &TrinoAuthenticationConfig, trino_opa_config: &Option, cluster_info: &KubernetesClusterInfo, + resolved_fte_config: &Option, ) -> Result { let mut cm_conf_data = BTreeMap::new(); @@ -712,6 +736,16 @@ fn build_rolegroup_config_map( dynamic_resolved_config .extend(graceful_shutdown_config_properties(trino, trino_role)); + // Add fault tolerant execution properties from resolved configuration + if let Some(resolved_fte) = resolved_fte_config { + dynamic_resolved_config.extend( + resolved_fte + .config_properties + .iter() + .map(|(k, v)| (k.clone(), Some(v.clone()))), + ); + } + // Add static properties and overrides dynamic_resolved_config.extend(transformed_config); @@ -776,6 +810,22 @@ fn build_rolegroup_config_map( cm_conf_data.insert(JVM_CONFIG.to_string(), jvm_config.to_string()); + // Add exchange manager properties from resolved fault tolerant execution configuration + if let Some(resolved_fte) = resolved_fte_config { + if !resolved_fte.exchange_manager_properties.is_empty() { + let exchange_props_with_options: BTreeMap> = resolved_fte + .exchange_manager_properties + .iter() + .map(|(k, v)| (k.clone(), Some(v.clone()))) + .collect(); + cm_conf_data.insert( + EXCHANGE_MANAGER_PROPERTIES.to_string(), + to_java_properties_string(exchange_props_with_options.iter()) + .with_context(|_| FailedToWriteJavaPropertiesSnafu)?, + ); + } + } + let jvm_sec_props: BTreeMap> = config .get(&PropertyNameKind::File(JVM_SECURITY_PROPERTIES.to_string())) .cloned() @@ -884,6 +934,7 @@ fn build_rolegroup_statefulset( 
trino_authentication_config: &TrinoAuthenticationConfig, catalogs: &[CatalogConfig], sa_name: &str, + resolved_fte_config: &Option, ) -> Result { let role = trino .role(trino_role) @@ -974,6 +1025,7 @@ fn build_rolegroup_statefulset( &mut cb_trino, catalogs, &requested_secret_lifetime, + resolved_fte_config, )?; let mut prepare_args = vec![]; @@ -992,6 +1044,7 @@ fn build_rolegroup_statefulset( trino, catalogs, merged_config, + resolved_fte_config, )); prepare_args @@ -1056,7 +1109,12 @@ fn build_rolegroup_statefulset( "-c".to_string(), ]) .args(vec![ - command::container_trino_args(trino_authentication_config, catalogs).join("\n"), + command::container_trino_args( + trino_authentication_config, + catalogs, + resolved_fte_config, + ) + .join("\n"), ]) .add_env_vars(env) .add_volume_mount("config", CONFIG_DIR_NAME) @@ -1532,6 +1590,7 @@ fn tls_volume_mounts( cb_trino: &mut ContainerBuilder, catalogs: &[CatalogConfig], requested_secret_lifetime: &Duration, + resolved_fte_config: &Option, ) -> Result<()> { if let Some(server_tls) = trino.get_server_tls() { cb_prepare @@ -1611,6 +1670,19 @@ fn tls_volume_mounts( .context(AddVolumeSnafu)?; } + // fault tolerant execution S3 credentials and other resources + if let Some(resolved_fte) = resolved_fte_config { + cb_prepare + .add_volume_mounts(resolved_fte.volume_mounts.clone()) + .context(AddVolumeMountSnafu)?; + cb_trino + .add_volume_mounts(resolved_fte.volume_mounts.clone()) + .context(AddVolumeMountSnafu)?; + pod_builder + .add_volumes(resolved_fte.volumes.clone()) + .context(AddVolumeSnafu)?; + } + Ok(()) } @@ -1780,6 +1852,7 @@ mod tests { &trino_authentication_config, &trino_opa_config, &cluster_info, + &None, ) .unwrap() } diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs b/rust/operator-binary/src/crd/fault_tolerant_execution.rs new file mode 100644 index 00000000..f39340a5 --- /dev/null +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -0,0 +1,826 @@ +//! 
This module handles fault tolerant execution configuration for Trino. +//! +//! It processes the FaultTolerantExecutionConfig from the cluster configuration and +//! generates the appropriate properties for config.properties and exchange-manager.properties. +//! +//! Based on the Trino documentation: + +use std::collections::BTreeMap; + +use serde::{Deserialize, Serialize}; +use snafu::Snafu; +use stackable_operator::{ + builder::pod::volume::{SecretOperatorVolumeSourceBuilder, VolumeBuilder, VolumeMountBuilder}, + client::Client, + commons::tls_verification::{CaCert, TlsServerVerification, TlsVerification}, + crd::s3, + k8s_openapi::api::core::v1::{Volume, VolumeMount}, + schemars::{self, JsonSchema}, + time::Duration, +}; + +use super::catalog::commons::HdfsConnection; +use crate::{ + command, + crd::{CONFIG_DIR_NAME, STACKABLE_CLIENT_TLS_DIR}, +}; + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct FaultTolerantExecutionConfig { + /// The retry policy for fault tolerant execution. + /// `QUERY` retries entire queries, `TASK` retries individual tasks. + /// When set to `TASK`, an exchange manager must be configured. + pub retry_policy: RetryPolicy, + + /// Exchange manager configuration for spooling intermediate data during fault tolerant execution. + /// Required when using `TASK` retry policy, optional for `QUERY` retry policy. + #[serde(skip_serializing_if = "Option::is_none")] + pub exchange_manager: Option, + + /// Maximum number of times Trino may attempt to retry a query before declaring it failed. + /// Only applies to `QUERY` retry policy. + #[serde(skip_serializing_if = "Option::is_none")] + pub query_retry_attempts: Option, + + /// Maximum number of times Trino may attempt to retry a single task before declaring the query failed. + /// Only applies to `TASK` retry policy. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub task_retry_attempts_per_task: Option, + + /// Minimum time that a failed query or task must wait before it is retried. + #[serde(skip_serializing_if = "Option::is_none")] + pub retry_initial_delay: Option, + + /// Maximum time that a failed query or task must wait before it is retried. + #[serde(skip_serializing_if = "Option::is_none")] + pub retry_max_delay: Option, + + /// Factor by which retry delay is increased on each query or task failure. + #[serde(skip_serializing_if = "Option::is_none")] + pub retry_delay_scale_factor: Option, + + /// Data size of the coordinator's in-memory buffer used to store output of query stages. + #[serde(skip_serializing_if = "Option::is_none")] + pub exchange_deduplication_buffer_size: Option, + + /// Whether to enable encryption of spooling data. + #[serde(skip_serializing_if = "Option::is_none")] + pub exchange_encryption_enabled: Option, +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum RetryPolicy { + /// Retry entire queries on failure + Query, + /// Retry individual tasks on failure (requires exchange manager) + Task, +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ExchangeManagerConfig { + /// General exchange manager configuration that applies to all backends. + #[serde(flatten)] + pub general: ExchangeManagerGeneralConfig, + + /// Backend-specific configuration. + #[serde(flatten)] + pub backend: ExchangeManagerBackend, +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ExchangeManagerGeneralConfig { + /// The minimum buffer pool size for an exchange sink. The larger the buffer pool size, + /// the larger the write parallelism and memory usage. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub sink_buffer_pool_min_size: Option, + + /// The number of buffers per partition in the buffer pool. The larger the buffer pool size, + /// the larger the write parallelism and memory usage. + #[serde(skip_serializing_if = "Option::is_none")] + pub sink_buffers_per_partition: Option, + + /// Max data size of files written by exchange sinks. + #[serde(skip_serializing_if = "Option::is_none")] + pub sink_max_file_size: Option, + + /// Number of concurrent readers to read from spooling storage. The larger the number of + /// concurrent readers, the larger the read parallelism and memory usage. + #[serde(skip_serializing_if = "Option::is_none")] + pub source_concurrent_readers: Option, +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub enum ExchangeManagerBackend { + /// S3-compatible storage configuration (includes AWS S3, MinIO, GCS). + #[serde(rename = "s3")] + S3(S3ExchangeConfig), + /// Azure Blob Storage configuration. + #[serde(rename = "azure")] + Azure(AzureExchangeConfig), + /// HDFS-based exchange manager. + #[serde(rename = "hdfs")] + Hdfs(HdfsExchangeConfig), + /// Local filesystem storage (not recommended for production). + #[serde(rename = "local")] + Local(LocalExchangeConfig), +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct S3ExchangeConfig { + /// S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). + /// For GCS, use gs:// URIs (e.g., gs://bucket1,gs://bucket2). + pub base_directories: Vec, + /// S3 connection configuration. + /// Learn more about S3 configuration in the [S3 concept docs](DOCS_BASE_URL_PLACEHOLDER/concepts/s3). + pub connection: stackable_operator::crd::s3::v1alpha1::InlineConnectionOrReference, + /// IAM role to assume for S3 access. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub iam_role: Option, + /// External ID for the IAM role trust policy. + #[serde(skip_serializing_if = "Option::is_none")] + pub external_id: Option, + /// Maximum number of times the S3 client should retry a request. + #[serde(skip_serializing_if = "Option::is_none")] + pub max_error_retries: Option, + /// Part data size for S3 multi-part upload. + #[serde(skip_serializing_if = "Option::is_none")] + pub upload_part_size: Option, + /// Google Cloud Storage service account key in JSON format. + /// Required when using GCS (gs:// URIs). Should contain the JSON service account key. + /// The operator will mount this as a file and configure `exchange.gcs.json-key-file-path`. + #[serde(skip_serializing_if = "Option::is_none")] + pub gcs_service_account_key: Option, +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct GcsServiceAccountKey { + /// [SecretClass](DOCS_BASE_URL_PLACEHOLDER/secret-operator/secretclass) providing the GCS service account key. + pub secret_class: String, + /// Key name in the Secret that contains the JSON service account key. + #[serde(skip_serializing_if = "Option::is_none")] + pub key: Option, +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct AzureExchangeConfig { + /// Azure Blob Storage container URIs for spooling data. + pub base_directories: Vec, + /// [SecretClass](DOCS_BASE_URL_PLACEHOLDER/secret-operator/secretclass) providing the Azure connection string. + pub secret_class: String, + /// Key name in the Secret that contains the connection string. + #[serde(skip_serializing_if = "Option::is_none")] + pub key: Option, + /// Azure blob endpoint URL (optional, used instead of connection string). + #[serde(skip_serializing_if = "Option::is_none")] + pub endpoint: Option, + /// Block data size for Azure block blob parallel upload. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub block_size: Option, + /// Maximum number of times the Azure client should retry a request. + #[serde(skip_serializing_if = "Option::is_none")] + pub max_error_retries: Option, +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct HdfsExchangeConfig { + /// HDFS URIs for spooling data. + pub base_directories: Vec, + /// HDFS connection configuration. + pub hdfs: HdfsConnection, + /// Block data size for HDFS storage. + #[serde(skip_serializing_if = "Option::is_none")] + pub block_size: Option, + /// Skip directory scheme validation to support Hadoop-compatible file systems. + #[serde(skip_serializing_if = "Option::is_none")] + pub skip_directory_scheme_validation: Option, +} + +#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct LocalExchangeConfig { + /// Local filesystem paths for exchange storage. 
+ pub base_directories: Vec, +} + +#[derive(Snafu, Debug)] +pub enum Error { + #[snafu(display("Exchange manager is required when using TASK retry policy"))] + ExchangeManagerRequiredForTaskPolicy, + + #[snafu(display("Failed to resolve S3 connection"))] + S3Connection { + source: s3::v1alpha1::ConnectionError, + }, + + #[snafu(display("trino does not support disabling the TLS verification of S3 servers"))] + S3TlsNoVerificationNotSupported, + + #[snafu(display("Failed to build Azure SecretClass volume"))] + AzureSecretClassVolumeBuild { + source: stackable_operator::builder::pod::volume::SecretOperatorVolumeSourceBuilderError, + }, +} + +/// Fault tolerant execution configuration with external resources resolved +pub struct ResolvedFaultTolerantExecutionConfig { + /// Properties to add to config.properties + pub config_properties: BTreeMap, + /// Properties to add to exchange-manager.properties (if needed) + pub exchange_manager_properties: BTreeMap, + /// Volumes required for the configuration (e.g., for S3 credentials) + pub volumes: Vec, + /// Volume mounts required for the configuration + pub volume_mounts: Vec, + /// Env-Vars that should be exported from files. 
+ /// You can think of it like `export ="$(cat )"` + pub load_env_from_files: BTreeMap, + /// Additional commands that need to be executed before starting Trino + pub init_container_extra_start_commands: Vec, +} + +impl ResolvedFaultTolerantExecutionConfig { + /// Helper function to insert optional values into properties map + fn insert_if_present( + properties: &mut BTreeMap, + key: &str, + value: Option, + ) { + if let Some(v) = value { + properties.insert(key.to_string(), v.to_string()); + } + } + + /// Create a resolved fault tolerant execution configuration from the cluster config + pub async fn from_config( + config: &FaultTolerantExecutionConfig, + client: Option<&Client>, + namespace: &str, + ) -> Result { + if matches!(config.retry_policy, RetryPolicy::Task) && config.exchange_manager.is_none() { + return Err(Error::ExchangeManagerRequiredForTaskPolicy); + } + + let mut config_properties = BTreeMap::new(); + + let retry_policy = match config.retry_policy { + RetryPolicy::Query => "QUERY", + RetryPolicy::Task => "TASK", + }; + config_properties.insert("retry-policy".to_string(), retry_policy.to_string()); + + Self::insert_if_present( + &mut config_properties, + "query-retry-attempts", + config.query_retry_attempts, + ); + Self::insert_if_present( + &mut config_properties, + "task-retry-attempts-per-task", + config.task_retry_attempts_per_task, + ); + Self::insert_if_present( + &mut config_properties, + "retry-initial-delay", + config.retry_initial_delay.as_ref(), + ); + Self::insert_if_present( + &mut config_properties, + "retry-max-delay", + config.retry_max_delay.as_ref(), + ); + Self::insert_if_present( + &mut config_properties, + "retry-delay-scale-factor", + config.retry_delay_scale_factor.as_ref(), + ); + Self::insert_if_present( + &mut config_properties, + "exchange.deduplication-buffer-size", + config.exchange_deduplication_buffer_size.as_ref(), + ); + Self::insert_if_present( + &mut config_properties, + 
"fault-tolerant-execution.exchange-encryption-enabled", + config.exchange_encryption_enabled, + ); + + let mut exchange_manager_properties = BTreeMap::new(); + if let Some(exchange_config) = &config.exchange_manager { + // Add general properties + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.sink-buffer-pool-min-size", + exchange_config.general.sink_buffer_pool_min_size, + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.sink-buffers-per-partition", + exchange_config.general.sink_buffers_per_partition, + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.sink-max-file-size", + exchange_config.general.sink_max_file_size.as_ref(), + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.source-concurrent-readers", + exchange_config.general.source_concurrent_readers, + ); + + // Add backend-specific configuration + match &exchange_config.backend { + ExchangeManagerBackend::S3(s3_config) => { + exchange_manager_properties.insert( + "exchange-manager.name".to_string(), + "filesystem".to_string(), + ); + exchange_manager_properties.insert( + "exchange.base-directories".to_string(), + s3_config.base_directories.join(","), + ); + + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.s3.iam-role", + s3_config.iam_role.as_ref(), + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.s3.external-id", + s3_config.external_id.as_ref(), + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.s3.max-error-retries", + s3_config.max_error_retries, + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.s3.upload.part-size", + s3_config.upload_part_size.as_ref(), + ); + } + ExchangeManagerBackend::Azure(azure_config) => { + exchange_manager_properties.insert( + "exchange-manager.name".to_string(), + "filesystem".to_string(), + ); + exchange_manager_properties.insert( + 
"exchange.base-directories".to_string(), + azure_config.base_directories.join(","), + ); + + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.azure.endpoint", + azure_config.endpoint.as_ref(), + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.azure.block-size", + azure_config.block_size.as_ref(), + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.azure.max-error-retries", + azure_config.max_error_retries, + ); + } + ExchangeManagerBackend::Hdfs(hdfs_config) => { + exchange_manager_properties + .insert("exchange-manager.name".to_string(), "hdfs".to_string()); + exchange_manager_properties.insert( + "exchange.base-directories".to_string(), + hdfs_config.base_directories.join(","), + ); + + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.hdfs.block-size", + hdfs_config.block_size.as_ref(), + ); + Self::insert_if_present( + &mut exchange_manager_properties, + "exchange.hdfs.skip-directory-scheme-validation", + hdfs_config.skip_directory_scheme_validation, + ); + + let hdfs_config_dir = format!("{CONFIG_DIR_NAME}/exchange-hdfs-config"); + exchange_manager_properties.insert( + "hdfs.config.resources".to_string(), + format!("{hdfs_config_dir}/core-site.xml,{hdfs_config_dir}/hdfs-site.xml"), + ); + } + ExchangeManagerBackend::Local(local_config) => { + exchange_manager_properties.insert( + "exchange-manager.name".to_string(), + "filesystem".to_string(), + ); + exchange_manager_properties.insert( + "exchange.base-directories".to_string(), + local_config.base_directories.join(","), + ); + } + } + } + + let mut resolved_config = Self { + config_properties, + exchange_manager_properties, + volumes: Vec::new(), + volume_mounts: Vec::new(), + load_env_from_files: BTreeMap::new(), + init_container_extra_start_commands: Vec::new(), + }; + + // Resolve external resources if Kubernetes client is available + // This should always be the case, except for when this function is 
called during unit tests + if let (Some(client), Some(exchange_config)) = (client, &config.exchange_manager) { + match &exchange_config.backend { + ExchangeManagerBackend::S3(s3_config) => { + resolved_config + .resolve_s3_backend(s3_config, client, namespace) + .await?; + } + ExchangeManagerBackend::Azure(azure_config) => { + resolved_config.resolve_azure_backend(azure_config).await?; + } + ExchangeManagerBackend::Hdfs(hdfs_config) => { + resolved_config.resolve_hdfs_backend(hdfs_config); + } + ExchangeManagerBackend::Local(_) => { + // Local backend requires no external resource resolution + } + } + } + + Ok(resolved_config) + } + + async fn resolve_s3_backend( + &mut self, + s3_config: &S3ExchangeConfig, + client: &Client, + namespace: &str, + ) -> Result<(), Error> { + use snafu::ResultExt; + + let s3_connection = s3_config + .connection + .clone() + .resolve(client, namespace) + .await + .context(S3ConnectionSnafu)?; + + let (volumes, mounts) = s3_connection + .volumes_and_mounts() + .context(S3ConnectionSnafu)?; + self.volumes.extend(volumes); + self.volume_mounts.extend(mounts); + + self.exchange_manager_properties.insert( + "exchange.s3.region".to_string(), + s3_connection.region.name.clone(), + ); + self.exchange_manager_properties.insert( + "exchange.s3.endpoint".to_string(), + s3_connection + .endpoint() + .context(S3ConnectionSnafu)? 
+ .to_string(), + ); + self.exchange_manager_properties.insert( + "exchange.s3.path-style-access".to_string(), + (s3_connection.access_style == s3::v1alpha1::S3AccessStyle::Path).to_string(), + ); + + if let Some((access_key_path, secret_key_path)) = s3_connection.credentials_mount_paths() { + let access_key_env = "EXCHANGE_S3_AWS_ACCESS_KEY".to_string(); + let secret_key_env = "EXCHANGE_S3_AWS_SECRET_KEY".to_string(); + + self.exchange_manager_properties.insert( + "exchange.s3.aws-access-key".to_string(), + format!("${{ENV:{access_key_env}}}"), + ); + self.exchange_manager_properties.insert( + "exchange.s3.aws-secret-key".to_string(), + format!("${{ENV:{secret_key_env}}}"), + ); + + self.load_env_from_files + .insert(access_key_env, access_key_path); + self.load_env_from_files + .insert(secret_key_env, secret_key_path); + } + + if let Some(tls) = s3_connection.tls.tls.as_ref() { + match &tls.verification { + TlsVerification::None {} => return S3TlsNoVerificationNotSupportedSnafu.fail(), + TlsVerification::Server(TlsServerVerification { + ca_cert: CaCert::WebPki {}, + }) => {} + TlsVerification::Server(TlsServerVerification { + ca_cert: CaCert::SecretClass(_), + }) => { + if let Some(ca_cert) = s3_connection.tls.tls_ca_cert_mount_path() { + self.init_container_extra_start_commands.extend( + command::add_cert_to_truststore( + &ca_cert, + STACKABLE_CLIENT_TLS_DIR, + "exchange-s3-ca-cert", + ), + ); + } + } + } + } + + if let Some(gcs_key_config) = &s3_config.gcs_service_account_key { + let gcs_secret_mount_dir = format!("{CONFIG_DIR_NAME}/exchange-gcs-key"); + let volume_name = "exchange-gcs-key".to_string(); + let default_key_name = "key.json".to_string(); + let key_name = gcs_key_config.key.as_ref().unwrap_or(&default_key_name); + + let secret_volume_source = + SecretOperatorVolumeSourceBuilder::new(&gcs_key_config.secret_class) + .build() + .context(AzureSecretClassVolumeBuildSnafu)?; + + self.volumes.push( + VolumeBuilder::new(&volume_name) + 
.ephemeral(secret_volume_source) + .build(), + ); + self.volume_mounts.push( + VolumeMountBuilder::new(&volume_name, &gcs_secret_mount_dir) + .read_only(true) + .build(), + ); + + let json_key_file_path = format!("{gcs_secret_mount_dir}/{key_name}"); + self.exchange_manager_properties.insert( + "exchange.gcs.json-key-file-path".to_string(), + json_key_file_path, + ); + } + + Ok(()) + } + + async fn resolve_azure_backend( + &mut self, + azure_config: &AzureExchangeConfig, + ) -> Result<(), Error> { + use snafu::ResultExt; + + let azure_secret_mount_dir = format!("{CONFIG_DIR_NAME}/exchange-azure-secret"); + let volume_name = "exchange-azure-secret".to_string(); + let default_key_name = "connectionString".to_string(); + let key_name = azure_config.key.as_ref().unwrap_or(&default_key_name); + + let secret_volume_source = + SecretOperatorVolumeSourceBuilder::new(&azure_config.secret_class) + .build() + .context(AzureSecretClassVolumeBuildSnafu)?; + + self.volumes.push( + VolumeBuilder::new(&volume_name) + .ephemeral(secret_volume_source) + .build(), + ); + self.volume_mounts.push( + VolumeMountBuilder::new(&volume_name, &azure_secret_mount_dir) + .read_only(true) + .build(), + ); + + let connection_string_env = "EXCHANGE_AZURE_CONNECTION_STRING".to_string(); + self.exchange_manager_properties.insert( + "exchange.azure.connection-string".to_string(), + format!("${{ENV:{connection_string_env}}}"), + ); + + let connection_string_path = format!("{azure_secret_mount_dir}/{key_name}"); + self.load_env_from_files + .insert(connection_string_env, connection_string_path); + + Ok(()) + } + + fn resolve_hdfs_backend(&mut self, hdfs_config: &HdfsExchangeConfig) { + let hdfs_config_dir = format!("{CONFIG_DIR_NAME}/exchange-hdfs-config"); + let volume_name = "exchange-hdfs-config".to_string(); + + self.volumes.push( + VolumeBuilder::new(&volume_name) + .with_config_map(&hdfs_config.hdfs.config_map) + .build(), + ); + self.volume_mounts + .push(VolumeMountBuilder::new(&volume_name, 
&hdfs_config_dir).build()); + } +} + +#[cfg(test)] +mod tests { + use stackable_operator::time::Duration; + + use super::*; + + #[tokio::test] + async fn test_query_retry_policy_without_exchange_manager() { + let config = FaultTolerantExecutionConfig { + retry_policy: RetryPolicy::Query, + exchange_manager: None, + query_retry_attempts: Some(5), + task_retry_attempts_per_task: None, + retry_initial_delay: Some(Duration::from_secs(15)), + retry_max_delay: Some(Duration::from_secs(90)), + retry_delay_scale_factor: Some("3.0".to_string()), + exchange_deduplication_buffer_size: Some("64MB".to_string()), + exchange_encryption_enabled: Some(false), + }; + + let fte_config = + ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default") + .await + .unwrap(); + + assert_eq!( + fte_config.config_properties.get("retry-policy"), + Some(&"QUERY".to_string()) + ); + assert_eq!( + fte_config.config_properties.get("query-retry-attempts"), + Some(&"5".to_string()) + ); + assert_eq!( + fte_config.config_properties.get("retry-initial-delay"), + Some(&"15s".to_string()) + ); + assert_eq!( + fte_config.config_properties.get("retry-max-delay"), + Some(&"1m30s".to_string()) + ); + assert_eq!( + fte_config.config_properties.get("retry-delay-scale-factor"), + Some(&"3.0".to_string()) + ); + assert_eq!( + fte_config + .config_properties + .get("exchange.deduplication-buffer-size"), + Some(&"64MB".to_string()) + ); + assert_eq!( + fte_config + .config_properties + .get("fault-tolerant-execution.exchange-encryption-enabled"), + Some(&"false".to_string()) + ); + assert!(fte_config.exchange_manager_properties.is_empty()); + } + + #[tokio::test] + async fn test_task_retry_policy_requires_exchange_manager() { + let config = FaultTolerantExecutionConfig { + retry_policy: RetryPolicy::Task, + exchange_manager: None, + query_retry_attempts: None, + task_retry_attempts_per_task: Some(3), + retry_initial_delay: None, + retry_max_delay: None, + retry_delay_scale_factor: None, + 
exchange_deduplication_buffer_size: None, + exchange_encryption_enabled: None, + }; + + let result = + ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default").await; + assert!(matches!( + result, + Err(Error::ExchangeManagerRequiredForTaskPolicy) + )); + } + + #[tokio::test] + async fn test_task_retry_policy_with_s3_exchange_manager() { + let config = FaultTolerantExecutionConfig { + retry_policy: RetryPolicy::Task, + exchange_manager: Some(ExchangeManagerConfig { + general: ExchangeManagerGeneralConfig { + sink_buffer_pool_min_size: Some(20), + sink_buffers_per_partition: Some(4), + sink_max_file_size: Some("2GB".to_string()), + source_concurrent_readers: Some(8), + }, + backend: ExchangeManagerBackend::S3(S3ExchangeConfig { + base_directories: vec!["s3://my-bucket/exchange".to_string()], + connection: stackable_operator::crd::s3::v1alpha1::InlineConnectionOrReference::Reference( + "test-s3-connection".to_string() + ), + iam_role: Some("arn:aws:iam::123456789012:role/TrinoRole".to_string()), + external_id: Some("external-id-123".to_string()), + max_error_retries: Some(5), + upload_part_size: Some("10MB".to_string()), + gcs_service_account_key: None, + }), + }), + query_retry_attempts: None, + task_retry_attempts_per_task: Some(2), + retry_initial_delay: None, + retry_max_delay: None, + retry_delay_scale_factor: None, + exchange_deduplication_buffer_size: None, + exchange_encryption_enabled: None, + }; + + let fte_config = + ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default") + .await + .unwrap(); + + assert_eq!( + fte_config.config_properties.get("retry-policy"), + Some(&"TASK".to_string()) + ); + assert_eq!( + fte_config + .config_properties + .get("task-retry-attempts-per-task"), + Some(&"2".to_string()) + ); + + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange-manager.name"), + Some(&"filesystem".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + 
.get("exchange.base-directories"), + Some(&"s3://my-bucket/exchange".to_string()) + ); + + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.s3.iam-role"), + Some(&"arn:aws:iam::123456789012:role/TrinoRole".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.s3.external-id"), + Some(&"external-id-123".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.s3.max-error-retries"), + Some(&"5".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.s3.upload.part-size"), + Some(&"10MB".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.sink-buffer-pool-min-size"), + Some(&"20".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.sink-buffers-per-partition"), + Some(&"4".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.sink-max-file-size"), + Some(&"2GB".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.source-concurrent-readers"), + Some(&"8".to_string()) + ); + } +} diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 14fca469..9a73bca4 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -2,6 +2,7 @@ pub mod affinity; pub mod authentication; pub mod catalog; pub mod discovery; +pub mod fault_tolerant_execution; use std::{collections::BTreeMap, ops::Div, str::FromStr}; @@ -59,6 +60,7 @@ pub const NODE_PROPERTIES: &str = "node.properties"; pub const LOG_PROPERTIES: &str = "log.properties"; pub const ACCESS_CONTROL_PROPERTIES: &str = "access-control.properties"; pub const JVM_SECURITY_PROPERTIES: &str = "security.properties"; +pub const EXCHANGE_MANAGER_PROPERTIES: &str = "exchange-manager.properties"; // node.properties pub const NODE_ENVIRONMENT: &str = "node.environment"; // 
config.properties @@ -283,6 +285,12 @@ pub mod versioned { #[serde(default)] pub tls: TrinoTls, + /// Fault tolerant execution configuration. + /// When enabled, Trino can automatically retry queries or tasks in case of failures. + #[serde(skip_serializing_if = "Option::is_none")] + pub fault_tolerant_execution: + Option, + /// Name of the Vector aggregator [discovery ConfigMap](DOCS_BASE_URL_PLACEHOLDER/concepts/service_discovery). /// It must contain the key `ADDRESS` with the address of the Vector aggregator. /// Follow the [logging tutorial](DOCS_BASE_URL_PLACEHOLDER/tutorials/logging-vector-aggregator) From 96ebd558809741aaeb0ca20d7e281d76c35f757f Mon Sep 17 00:00:00 2001 From: dervoeti Date: Tue, 5 Aug 2025 19:21:50 +0200 Subject: [PATCH 03/26] test: fault-tolerant execution integration test --- .../fault-tolerant-execution/00-assert.yaml | 19 ++++ ...tor-aggregator-discovery-configmap.yaml.j2 | 9 ++ .../00-patch-ns.yaml.j2 | 9 ++ .../fault-tolerant-execution/00-rbac.yaml.j2 | 29 ++++++ .../fault-tolerant-execution/00-secrets.yaml | 62 +++++++++++++ .../fault-tolerant-execution/01-assert.yaml | 17 ++++ .../01-install-minio.yaml | 11 +++ .../fault-tolerant-execution/02-assert.yaml | 25 +++++ .../02-install-trino.yaml.j2 | 57 ++++++++++++ .../fault-tolerant-execution/03-assert.yaml | 12 +++ .../03-install-test-helper.yaml | 29 ++++++ .../04-copy-scripts.yaml | 5 + .../fault-tolerant-execution/05-assert.yaml | 20 ++++ .../05-run-tests.yaml | 16 ++++ .../fault-tolerant-execution/check-fte.py | 93 +++++++++++++++++++ .../helm-bitnami-minio-values.yaml | 54 +++++++++++ tests/test-definition.yaml | 4 + 17 files changed, 471 insertions(+) create mode 100644 tests/templates/kuttl/fault-tolerant-execution/00-assert.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/00-install-vector-aggregator-discovery-configmap.yaml.j2 create mode 100644 tests/templates/kuttl/fault-tolerant-execution/00-patch-ns.yaml.j2 create mode 100644 
tests/templates/kuttl/fault-tolerant-execution/00-rbac.yaml.j2 create mode 100644 tests/templates/kuttl/fault-tolerant-execution/00-secrets.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/01-assert.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/02-assert.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 create mode 100644 tests/templates/kuttl/fault-tolerant-execution/03-assert.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/03-install-test-helper.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/05-assert.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml create mode 100644 tests/templates/kuttl/fault-tolerant-execution/check-fte.py create mode 100644 tests/templates/kuttl/fault-tolerant-execution/helm-bitnami-minio-values.yaml diff --git a/tests/templates/kuttl/fault-tolerant-execution/00-assert.yaml b/tests/templates/kuttl/fault-tolerant-execution/00-assert.yaml new file mode 100644 index 00000000..47bfe1ea --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/00-assert.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 300 +--- +apiVersion: v1 +kind: Secret +metadata: + name: minio-credentials +--- +apiVersion: v1 +kind: Secret +metadata: + name: minio-tls-certificates +--- +apiVersion: s3.stackable.tech/v1alpha1 +kind: S3Connection +metadata: + name: minio diff --git a/tests/templates/kuttl/fault-tolerant-execution/00-install-vector-aggregator-discovery-configmap.yaml.j2 b/tests/templates/kuttl/fault-tolerant-execution/00-install-vector-aggregator-discovery-configmap.yaml.j2 new file mode 100644 index 00000000..2d6a0df5 --- /dev/null +++ 
b/tests/templates/kuttl/fault-tolerant-execution/00-install-vector-aggregator-discovery-configmap.yaml.j2 @@ -0,0 +1,9 @@ +{% if lookup('env', 'VECTOR_AGGREGATOR') %} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-aggregator-discovery +data: + ADDRESS: {{ lookup('env', 'VECTOR_AGGREGATOR') }} +{% endif %} diff --git a/tests/templates/kuttl/fault-tolerant-execution/00-patch-ns.yaml.j2 b/tests/templates/kuttl/fault-tolerant-execution/00-patch-ns.yaml.j2 new file mode 100644 index 00000000..67185acf --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/00-patch-ns.yaml.j2 @@ -0,0 +1,9 @@ +{% if test_scenario['values']['openshift'] == 'true' %} +# see https://github.com/stackabletech/issues/issues/566 +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl patch namespace $NAMESPACE -p '{"metadata":{"labels":{"pod-security.kubernetes.io/enforce":"privileged"}}}' + timeout: 120 +{% endif %} diff --git a/tests/templates/kuttl/fault-tolerant-execution/00-rbac.yaml.j2 b/tests/templates/kuttl/fault-tolerant-execution/00-rbac.yaml.j2 new file mode 100644 index 00000000..9cbf0351 --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/00-rbac.yaml.j2 @@ -0,0 +1,29 @@ +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: use-integration-tests-scc +rules: +{% if test_scenario['values']['openshift'] == "true" %} + - apiGroups: ["security.openshift.io"] + resources: ["securitycontextconstraints"] + resourceNames: ["privileged"] + verbs: ["use"] +{% endif %} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: integration-tests-sa +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: use-integration-tests-scc +subjects: + - kind: ServiceAccount + name: integration-tests-sa +roleRef: + kind: Role + name: use-integration-tests-scc + apiGroup: rbac.authorization.k8s.io diff --git a/tests/templates/kuttl/fault-tolerant-execution/00-secrets.yaml 
b/tests/templates/kuttl/fault-tolerant-execution/00-secrets.yaml new file mode 100644 index 00000000..b9dd795f --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/00-secrets.yaml @@ -0,0 +1,62 @@ +--- +apiVersion: v1 +kind: Secret +metadata: + name: minio-credentials + labels: + secrets.stackable.tech/class: s3-credentials-class +stringData: + accessKey: minioAccessKey + secretKey: minioSecretKey + # The following two entries are used by the Bitnami chart for MinIO to + # set up credentials for accessing buckets managed by the MinIO tenant. + root-user: minioAccessKey + root-password: minioSecretKey +--- +apiVersion: secrets.stackable.tech/v1alpha1 +kind: SecretClass +metadata: + name: s3-credentials-class +spec: + backend: + k8sSearch: + searchNamespace: + pod: {} +--- +apiVersion: secrets.stackable.tech/v1alpha1 +kind: SecretClass +metadata: + name: minio-tls-certificates +spec: + backend: + k8sSearch: + searchNamespace: + pod: {} +--- +apiVersion: v1 +kind: Secret +metadata: + name: minio-tls-certificates + labels: + secrets.stackable.tech/class: minio-tls-certificates +# Have a look at the folder certs on how to create this +data: + ca.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQyVENDQXNHZ0F3SUJBZ0lVTmpxdUdZV3R5SjVhNnd5MjNIejJHUmNNbHdNd0RRWUpLb1pJaHZjTkFRRUwKQlFBd2V6RUxNQWtHQTFVRUJoTUNSRVV4R3pBWkJnTlZCQWdNRWxOamFHeGxjM2RwWnkxSWIyeHpkR1ZwYmpFTwpNQXdHQTFVRUJ3d0ZWMlZrWld3eEtEQW1CZ05WQkFvTUgxTjBZV05yWVdKc1pTQlRhV2R1YVc1bklFRjFkR2h2CmNtbDBlU0JKYm1NeEZUQVRCZ05WQkFNTURITjBZV05yWVdKc1pTNWtaVEFnRncweU16QTJNVFl4TWpVeE1ESmEKR0E4eU1USXpNRFV5TXpFeU5URXdNbG93ZXpFTE1Ba0dBMVVFQmhNQ1JFVXhHekFaQmdOVkJBZ01FbE5qYUd4bApjM2RwWnkxSWIyeHpkR1ZwYmpFT01Bd0dBMVVFQnd3RlYyVmtaV3d4S0RBbUJnTlZCQW9NSDFOMFlXTnJZV0pzClpTQlRhV2R1YVc1bklFRjFkR2h2Y21sMGVTQkpibU14RlRBVEJnTlZCQU1NREhOMFlXTnJZV0pzWlM1a1pUQ0MKQVNJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dFUEFEQ0NBUW9DZ2dFQkFOblYvdmJ5M1JvNTdhMnF2UVJubjBqZQplS01VMitGMCtsWk5DQXZpR1VENWJtOGprOTFvUFpuazBiaFFxZXlFcm1EUzRXVDB6ZXZFUklCSkpEamZMMEQ4CjQ2QmU3UGlNS2UwZEdqb3FJM3o1Y09JZWpjOGFMUEhTSWxnTjZsVDNmSXJ1UzE2Y29RZ0c0dWFLaUhGNStlV0YKRFJVTGR1NmRzWXV6NmRLanFSaVVPaEh3RHd0VUprRHdQditFSXRxbzBIK01MRkxMWU0wK2xFSWFlN2RONUNRNQpTbzVXaEwyY3l2NVZKN2xqL0VBS0NWaUlFZ0NtekRSRGNSZ1NTald5SDRibjZ5WDIwMjZmUEl5V0pGeUVkTC82CmpBT0pBRERSMEd5aE5PWHJFZXFob2NTTW5JYlFWcXdBVDBrTWh1WFN2d3Zscm5MeVRwRzVqWm00bFVNMzRrTUMKQXdFQUFhTlRNRkV3SFFZRFZSME9CQllFRkVJM1JNTWl5aUJqeVExUlM4bmxPUkpWZDFwQk1COEdBMVVkSXdRWQpNQmFBRkVJM1JNTWl5aUJqeVExUlM4bmxPUkpWZDFwQk1BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJCmh2Y05BUUVMQlFBRGdnRUJBSHRLUlhkRmR0VWh0VWpvZG1ZUWNlZEFEaEhaT2hCcEtpbnpvdTRicmRrNEhmaEYKTHIvV0ZsY1JlbWxWNm1Cc0xweU11SytUZDhaVUVRNkpFUkx5NmxTL2M2cE9HeG5CNGFDbEU4YXQrQytUakpBTwpWbTNXU0k2VlIxY0ZYR2VaamxkVlE2eGtRc2tNSnpPN2RmNmlNVFB0VjVSa01lSlh0TDZYYW1FaTU0ckJvZ05ICk5yYStFSkJRQmwvWmU5ME5qZVlidjIwdVFwWmFhWkZhYVNtVm9OSERwQndsYTBvdXkrTWpPYkMzU3BnT3ExSUMKUGwzTnV3TkxWOFZiT3I1SHJoUUFvS21nU05iM1A4dmFUVnV4L1gwWWZqeS9TN045a1BCYUs5bUZqNzR6d1Y5dwpxU1ExNEtsNWpPM1YzaHJHV1laRWpET2diWnJyRVgxS1hFdXN0K1E9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUR5RENDQXJDZ0F3SUJBZ0lVQ0kyUE5OcnR6cDZRbDdHa3VhRnhtRGE2VUJvd0RRWUpLb1pJaHZjTkFRRUwKQlFBd2V6RUxNQWtHQTFVRUJoTUNSRVV4R3pBWkJnTlZCQWdNRWxOamFHeGxjM2RwWnkxSWIyeHpkR1ZwYmpFTwpNQXdHQTFVRUJ3d0ZWMlZrWld3eEtEQW1CZ05WQkFvTUgxTjBZV05yWVdKc1pTQlRhV2R1YVc1bklFRjFkR2h2CmNtbDBlU0JKYm1NeEZUQVRCZ05WQkFNTURITjBZV05yWVdKc1pTNWtaVEFnRncweU16QTJNVFl4TWpVeE1ESmEKR0E4eU1USXpNRFV5TXpFeU5URXdNbG93WGpFTE1Ba0dBMVVFQmhNQ1JFVXhHekFaQmdOVkJBZ01FbE5qYUd4bApjM2RwWnkxSWIyeHpkR1ZwYmpFT01Bd0dBMVVFQnd3RlYyVmtaV3d4RWpBUUJnTlZCQW9NQ1ZOMFlXTnJZV0pzClpURU9NQXdHQTFVRUF3d0ZiV2x1YVc4d2dnRWlNQTBHQ1NxR1NJYjNEUUVCQVFVQUE0SUJEd0F3Z2dFS0FvSUIKQVFDanluVnorWEhCOE9DWTRwc0VFWW1qb2JwZHpUbG93d2NTUU4rWURQQ2tCZW9yMFRiODdFZ0x6SksrSllidQpwb1hCbE5JSlBRYW93SkVvL1N6U2s4ZnUyWFNNeXZBWlk0RldHeEp5Mnl4SXh2UC9pYk9HT1l1aVBHWEsyNHQ2ClpjR1RVVmhhdWlaR1Nna1dyZWpXV2g3TWpGUytjMXZhWVpxQitRMXpQczVQRk1sYzhsNVYvK2I4WjdqTUppODQKbU9mSVB4amt2SXlKcjVVa2VGM1VmTHFKUzV5NExGNHR5NEZ0MmlBZDdiYmZIYW5mdlltdjZVb0RWdE1YdFdvMQpvUVBmdjNzaFdybVJMenc2ZXVJQXRiWGM1Q2pCeUlha0NiaURuQVU4cktnK0IxSjRtdlFnckx3bzNxUHJ5Smd4ClNkaWRtWjJtRVI3RXorYzVCMG0vTGlJaEFnTUJBQUdqWHpCZE1Cc0dBMVVkRVFRVU1CS0NCVzFwYm1sdmdnbHMKYjJOaGJHaHZjM1F3SFFZRFZSME9CQllFRkpRMGdENWtFdFFyK3REcERTWjdrd1o4SDVoR01COEdBMVVkSXdRWQpNQmFBRkVJM1JNTWl5aUJqeVExUlM4bmxPUkpWZDFwQk1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRQmNkaGQrClI0Sm9HdnFMQms1OWRxSVVlY2N0dUZzcmRQeHNCaU9GaFlOZ1pxZWRMTTBVTDVEenlmQUhmVk8wTGZTRURkZFgKUkpMOXlMNytrTVUwVDc2Y3ZkQzlYVkFJRTZIVXdUbzlHWXNQcXN1eVpvVmpOcEVESkN3WTNDdm9ubEpWZTRkcQovZ0FiSk1ZQitUU21ZNXlEUHovSkZZL1haellhUGI3T2RlR3VqYlZUNUl4cDk3QXBTOFlJaXY3M0Mwd1ViYzZSCmgwcmNmUmJ5a1NRVWg5dmdWZFhSU1I4RFQzV0NmZHFOek5CWVh2OW1xZlc1ejRzYkdqK2wzd1VsL0kzRi9tSXcKZnlPNEN0aTRha2lHVkhsZmZFeTB3a3pWYUJ4aGNYajJJM0JVVGhCNFpxamxzc2llVmFGa3d2WG1teVJUMG9FVwo1SCtOUEhjcXVTMXpQc2NsCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.key: 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2QUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktZd2dnU2lBZ0VBQW9JQkFRQ2p5blZ6K1hIQjhPQ1kKNHBzRUVZbWpvYnBkelRsb3d3Y1NRTitZRFBDa0Jlb3IwVGI4N0VnTHpKSytKWWJ1cG9YQmxOSUpQUWFvd0pFbwovU3pTazhmdTJYU015dkFaWTRGV0d4SnkyeXhJeHZQL2liT0dPWXVpUEdYSzI0dDZaY0dUVVZoYXVpWkdTZ2tXCnJlaldXaDdNakZTK2MxdmFZWnFCK1ExelBzNVBGTWxjOGw1Vi8rYjhaN2pNSmk4NG1PZklQeGprdkl5SnI1VWsKZUYzVWZMcUpTNXk0TEY0dHk0RnQyaUFkN2JiZkhhbmZ2WW12NlVvRFZ0TVh0V28xb1FQZnYzc2hXcm1STHp3NgpldUlBdGJYYzVDakJ5SWFrQ2JpRG5BVThyS2crQjFKNG12UWdyTHdvM3FQcnlKZ3hTZGlkbVoybUVSN0V6K2M1CkIwbS9MaUloQWdNQkFBRUNnZ0VBQWQzdDVzdUNFMjdXY0llc3NxZ3NoSFAwZHRzKyswVzF6K3h6WC8xTnhPRFkKWVhWNkJmbi9mRHJ4dFQ4aVFaZ2VVQzJORTFQaHZveXJXdWMvMm9xYXJjdEd1OUFZV29HNjJLdG9VMnpTSFdZLwpJN3VERTFXV2xOdlJZVFdOYW5DOGV4eGpRRzE4d0RKWjFpdFhTeEl0NWJEM3lrL3dUUlh0dCt1SnpyVjVqb2N1CmNoeERMd293aXUxQWo2ZFJDWk5CejlUSnh5TnI1ME5ZVzJVWEJhVC84N1hyRkZkSndNVFZUMEI3SE9uRzdSQlYKUWxLdzhtcVZiYU5lbmhjdk1qUjI5c3hUekhSK2p4SU8zQndPNk9Hai9PRmhGQllVN1RMWGVsZDFxb2UwdmIyRwpiOGhQcEd1cHRyNUF0OWx3MXc1d1EzSWdpdXRQTkg1cXlEeUNwRWw2RVFLQmdRRGNkYnNsT2ZLSmo3TzJMQXlZCkZ0a1RwaWxFMFYzajBxbVE5M0lqclY0K0RSbUxNRUIyOTk0MDdCVVlRUWoxL0RJYlFjb1oyRUVjVUI1cGRlSHMKN0RNRUQ2WExIYjJKVTEyK2E3c1d5Q05kS2VjZStUNy9JYmxJOFR0MzQwVWxIUTZ6U01TRGNqdmZjRkhWZ3YwcwpDYWpoRng3TmtMRVhUWnI4ZlQzWUloajR2UUtCZ1FDK01nWjFVbW9KdzlJQVFqMnVJVTVDeTl4aldlWURUQU8vCllhWEl6d2xnZTQzOE1jYmI0Y04yU2FOU0dEZ1Y3bnU1a3FpaWhwalBZV0lpaU9CcDlrVFJIWE9kUFc0N3N5ZUkKdDNrd3JwMnpWbFVnbGNNWlo2bW1WM1FWYUFOWmdqVTRSU3Y0ZS9WeFVMamJaYWZqUHRaUnNqWkdwSzBZVTFvdApWajhJZVE3Zk5RS0JnQ1ArWk11ekpsSW5VQ1FTRlF4UHpxbFNtN0pNckpPaHRXV2h3TlRxWFZTc050dHV5VmVqCktIaGpneDR1b0JQcFZSVDJMTlVEWmI0RnByRjVPYVhBK3FOVEdyS0s3SU1iUlZidHArSVVVeEhHNGFGQStIUVgKUVhVVFRhNUpRT1RLVmJnWHpWM1lyTVhTUk1valZNcDMyVWJHeTVTc1p2MXpBamJ2QzhYWjYxSFJBb0dBZEJjUQp2aGU1eFpBUzVEbUtjSGkvemlHa3ViZXJuNk9NUGdxYUtJSEdsVytVOExScFR0ajBkNFRtL1Rydk1PUEovVEU1CllVcUtoenBIcmhDaCtjdHBvY0k2U1dXdm5SenpLbzNpbVFaY0Y1VEFqUTBjY3F0RmI5UzlkRHR5bi9YTUNqYWUKYWlNdll5VUVVRll5TFpDelBGWnNycDNoVVpHKzN5RmZoQXB3TzJrQ2dZ
Qkh3WWFQSWRXNld3NytCMmhpbjBvdwpqYTNjZXN2QTRqYU1Qd1NMVDhPTnRVMUdCU01md2N6TWJuUEhMclJ2Qjg3bjlnUGFSMndRR1VtckZFTzNMUFgvCmtSY09HcFlCSHBEWEVqRGhLa1dkUnVMT0ZnNEhMWmRWOEFOWmxRMFZTY0U4dTNkRERVTzg5cEdEbjA4cVRBcmwKeDlreHN1ZEVWcmtlclpiNVV4RlZxUT09Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K +--- +apiVersion: s3.stackable.tech/v1alpha1 +kind: S3Connection +metadata: + name: minio +spec: + host: minio + port: 9000 + accessStyle: Path + credentials: + secretClass: s3-credentials-class + tls: + verification: + server: + caCert: + secretClass: minio-tls-certificates diff --git a/tests/templates/kuttl/fault-tolerant-execution/01-assert.yaml b/tests/templates/kuttl/fault-tolerant-execution/01-assert.yaml new file mode 100644 index 00000000..4d24ed7d --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/01-assert.yaml @@ -0,0 +1,17 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 600 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: minio +status: + readyReplicas: 1 + replicas: 1 +--- +apiVersion: v1 +kind: Service +metadata: + name: minio diff --git a/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml b/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml new file mode 100644 index 00000000..2247b8f1 --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: >- + helm install minio + --namespace $NAMESPACE + --version 15.0.7 + -f helm-bitnami-minio-values.yaml + oci://registry-1.docker.io/bitnamicharts/minio + timeout: 240 diff --git a/tests/templates/kuttl/fault-tolerant-execution/02-assert.yaml b/tests/templates/kuttl/fault-tolerant-execution/02-assert.yaml new file mode 100644 index 00000000..d4947eeb --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/02-assert.yaml @@ -0,0 +1,25 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 600 +--- +apiVersion: apps/v1 
+kind: StatefulSet +metadata: + name: trino-fte-coordinator-default +status: + readyReplicas: 1 + replicas: 1 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: trino-fte-worker-default +status: + readyReplicas: 2 + replicas: 2 +--- +apiVersion: trino.stackable.tech/v1alpha1 +kind: TrinoCatalog +metadata: + name: tpch diff --git a/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 b/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 new file mode 100644 index 00000000..d08fad01 --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 @@ -0,0 +1,57 @@ +--- +apiVersion: trino.stackable.tech/v1alpha1 +kind: TrinoCluster +metadata: + name: trino-fte +spec: + image: +{% if test_scenario['values']['trino'].find(",") > 0 %} + custom: "{{ test_scenario['values']['trino'].split(',')[1] }}" + productVersion: "{{ test_scenario['values']['trino'].split(',')[0] }}" +{% else %} + productVersion: "{{ test_scenario['values']['trino'] }}" +{% endif %} + pullPolicy: IfNotPresent + clusterConfig: + catalogLabelSelector: + matchLabels: + trino: trino-fte + # Fault tolerant execution with S3/MinIO exchange manager + faultTolerantExecution: + retryPolicy: TASK + exchangeManager: + s3: + baseDirectories: + - "s3://exchange-bucket/" + connection: + reference: "minio" +{% if lookup('env', 'VECTOR_AGGREGATOR') %} + vectorAggregatorConfigMapName: vector-aggregator-discovery +{% endif %} + coordinators: + config: + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 1 + config: {} + workers: + config: + gracefulShutdownTimeout: 5s # Let the test run faster + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 2 + config: {} +--- +apiVersion: trino.stackable.tech/v1alpha1 +kind: TrinoCatalog +metadata: + name: tpch + labels: + trino: trino-fte +spec: + connector: + 
tpch: {} diff --git a/tests/templates/kuttl/fault-tolerant-execution/03-assert.yaml b/tests/templates/kuttl/fault-tolerant-execution/03-assert.yaml new file mode 100644 index 00000000..168d5d8f --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/03-assert.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 300 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: trino-test-helper +status: + readyReplicas: 1 + replicas: 1 diff --git a/tests/templates/kuttl/fault-tolerant-execution/03-install-test-helper.yaml b/tests/templates/kuttl/fault-tolerant-execution/03-install-test-helper.yaml new file mode 100644 index 00000000..4980bd6f --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/03-install-test-helper.yaml @@ -0,0 +1,29 @@ +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: trino-test-helper + labels: + app: trino-test-helper +spec: + replicas: 1 + selector: + matchLabels: + app: trino-test-helper + template: + metadata: + labels: + app: trino-test-helper + spec: + serviceAccount: integration-tests-sa + containers: + - name: trino-test-helper + image: oci.stackable.tech/sdp/testing-tools:0.2.0-stackable0.0.0-dev + command: ["sleep", "infinity"] + resources: + requests: + cpu: "250m" + memory: "64Mi" + limits: + cpu: "250m" + memory: "64Mi" diff --git a/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml b/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml new file mode 100644 index 00000000..aea8e8b6 --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl cp -n $NAMESPACE check-fte.py trino-test-helper-0:/tmp/ diff --git a/tests/templates/kuttl/fault-tolerant-execution/05-assert.yaml b/tests/templates/kuttl/fault-tolerant-execution/05-assert.yaml new file mode 100644 index 00000000..615e91b2 --- /dev/null +++ 
b/tests/templates/kuttl/fault-tolerant-execution/05-assert.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 300 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: trino-fte-coordinator-default +status: + readyReplicas: 1 + replicas: 1 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: trino-fte-worker-default +status: + readyReplicas: 2 + replicas: 2 diff --git a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml new file mode 100644 index 00000000..acab7b53 --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml @@ -0,0 +1,16 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-fte.py -c trino-fte-coordinator -w 2 + timeout: 120 + # Verify that the exchange bucket contains data + - script: | + count=$(kubectl exec -n $NAMESPACE deployment/minio -- mc stat local/exchange-bucket | awk '/Objects count:/ {print $3}') + if [ "$count" -gt 0 ]; then + echo "Objects count is $count (> 0)" + else + echo "Objects count is $count (not > 0)" + exit 1 + fi + timeout: 20 \ No newline at end of file diff --git a/tests/templates/kuttl/fault-tolerant-execution/check-fte.py b/tests/templates/kuttl/fault-tolerant-execution/check-fte.py new file mode 100644 index 00000000..8685cb1f --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/check-fte.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +import trino +import argparse + +def get_connection(coordinator): + """Create anonymous connection for basic cluster health check""" + conn = trino.dbapi.connect( + host=coordinator, + port=8443, + user="test", + http_scheme="https", + verify=False, + session_properties={"query_max_execution_time": "60s"}, + ) + return conn + +if __name__ == "__main__": + # Construct an argument parser + all_args = argparse.ArgumentParser() + 
+ # Add arguments to the parser + all_args.add_argument( + "-c", + "--coordinator", + required=True, + help="Trino Coordinator Host to connect to", + ) + all_args.add_argument( + "-w", + "--workers", + required=True, + help="Expected amount of workers to be present", + ) + + args = vars(all_args.parse_args()) + + expected_workers = args["workers"] + conn = get_connection(args["coordinator"]) + + try: + cursor = conn.cursor() + + # Check that workers are active + cursor.execute( + "SELECT COUNT(*) as nodes FROM system.runtime.nodes WHERE coordinator=false AND state='active'" + ) + (active_workers,) = cursor.fetchone() + + if int(active_workers) != int(expected_workers): + print( + "Mismatch: [expected/active] workers [" + + str(expected_workers) + + "/" + + str(active_workers) + + "]" + ) + exit(-1) + + print(f"Active workers check passed: {active_workers}/{expected_workers}") + + # Test that TPCH connector is working + cursor.execute("SELECT COUNT(*) FROM tpch.tiny.nation") + result = cursor.fetchone() + if result[0] != 25: # TPCH tiny.nation has 25 rows + print(f"TPCH test failed: expected 25 nations, got {result[0]}") + exit(-1) + + print("TPCH connector test passed") + + # Test a more complex query + cursor.execute(""" + SELECT + nation.name, + COUNT(*) AS num_cust + FROM + tpch.tiny.customer + JOIN + tpch.tiny.nation ON customer.nationkey = nation.nationkey + GROUP BY + nation.name + ORDER BY + num_cust DESC + """) + results = cursor.fetchall() + if len(results) == 0: + print("Complex query returned no results") + exit(-1) + + except Exception as e: + print(f"Test failed with error: {e}") + import traceback + traceback.print_exc() + exit(-1) diff --git a/tests/templates/kuttl/fault-tolerant-execution/helm-bitnami-minio-values.yaml b/tests/templates/kuttl/fault-tolerant-execution/helm-bitnami-minio-values.yaml new file mode 100644 index 00000000..81c01ac9 --- /dev/null +++ b/tests/templates/kuttl/fault-tolerant-execution/helm-bitnami-minio-values.yaml @@ -0,0 
+1,54 @@ +--- +mode: standalone +disableWebUI: false +extraEnvVars: + - name: BITNAMI_DEBUG + value: "true" + - name: MINIO_LOG_LEVEL + value: DEBUG + +provisioning: + enabled: true + buckets: + - name: exchange-bucket + resources: + requests: + memory: 1Gi + cpu: "512m" + limits: + memory: "1Gi" + cpu: "1" + podSecurityContext: + enabled: false + containerSecurityContext: + enabled: false + +volumePermissions: + enabled: false + +podSecurityContext: + enabled: false + +containerSecurityContext: + enabled: false + +persistence: + enabled: false + +resources: + requests: + memory: 1Gi + cpu: "512m" + limits: + memory: "1Gi" + cpu: "1" + +auth: + existingSecret: minio-credentials + +service: + type: NodePort + +tls: + enabled: true + existingSecret: minio-tls-certificates diff --git a/tests/test-definition.yaml b/tests/test-definition.yaml index 92023474..6102c5ca 100644 --- a/tests/test-definition.yaml +++ b/tests/test-definition.yaml @@ -111,6 +111,10 @@ tests: - opa - keycloak - openshift + - name: fault-tolerant-execution + dimensions: + - trino + - openshift - name: listener dimensions: - trino From 04d31cb1845d019f6d1b8025c11325af7cfb8c82 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Tue, 5 Aug 2025 19:33:51 +0200 Subject: [PATCH 04/26] docs: fault-tolerant execution documentation --- .../pages/usage-guide/configuration.adoc | 3 + .../usage-guide/fault-tolerant-execution.adoc | 288 ++++++++++++++++++ docs/modules/trino/partials/nav.adoc | 1 + ...rino-cluster-fault-tolerant-execution.yaml | 109 +++++++ 4 files changed, 401 insertions(+) create mode 100644 docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc create mode 100644 examples/simple-trino-cluster-fault-tolerant-execution.yaml diff --git a/docs/modules/trino/pages/usage-guide/configuration.adoc b/docs/modules/trino/pages/usage-guide/configuration.adoc index 59ddc40b..fd7c805a 100644 --- a/docs/modules/trino/pages/usage-guide/configuration.adoc +++ 
b/docs/modules/trino/pages/usage-guide/configuration.adoc @@ -18,6 +18,9 @@ For a role or role group, at the same level of `config`, you can specify `config For a list of possible configuration properties consult the https://trino.io/docs/current/admin/properties.html[Trino Properties Reference]. +TIP: For fault-tolerant execution configuration, use the dedicated `faultTolerantExecution` section in the cluster configuration instead of `configOverrides`. +See xref:usage-guide/fault-tolerant-execution.adoc[] for detailed instructions. + [source,yaml] ---- workers: diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc new file mode 100644 index 00000000..4ad80ed5 --- /dev/null +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -0,0 +1,288 @@ += Fault-tolerant execution +:description: Configure fault-tolerant execution in Trino clusters for improved query resilience and automatic retry capabilities. +:keywords: fault-tolerant execution, retry policy, exchange manager, spooling, query resilience + +Fault-tolerant execution is a mechanism in Trino that enables a cluster to mitigate query failures by retrying queries or their component tasks in the event of failure. +With fault-tolerant execution enabled, intermediate exchange data is spooled and can be re-used by another worker in the event of a worker outage or other fault during query execution. + +By default, if a Trino node lacks the resources to execute a task or otherwise fails during query execution, the query fails and must be run again manually. +The longer the runtime of a query, the more likely it is to be susceptible to such failures. + +NOTE: Fault tolerance does not apply to broken queries or other user error. +For example, Trino does not spend resources retrying a query that fails because its SQL cannot be parsed. 
+ +Take a look at the link:https://trino.io/docs/current/admin/fault-tolerant-execution.html[Trino documentation for fault-tolerant execution {external-link-icon}^] to learn more. + +== Configuration + +Fault-tolerant execution is turned off by default. +To enable the feature, you need to configure it in your `TrinoCluster` resource by adding a `faultTolerantExecution` section to the cluster configuration: + +[source,yaml] +---- +spec: + clusterConfig: + faultTolerantExecution: + retryPolicy: QUERY # <1> + queryRetryAttempts: 3 # <2> +---- +<1> The retry policy - either `QUERY` or `TASK` +<2> Maximum number of times to retry a query (QUERY policy only) + +== Retry policies + +The `retryPolicy` configuration property designates whether Trino retries entire queries or a query's individual tasks in the event of failure. + +=== QUERY retry policy + +A `QUERY` retry policy instructs Trino to automatically retry a query in the event of an error occurring on a worker node. +A `QUERY` retry policy is recommended when the majority of the Trino cluster's workload consists of many small queries. + +By default, Trino does not implement fault tolerance for queries whose result set exceeds 32MB in size. +This limit can be increased by modifying the `exchangeDeduplicationBufferSize` configuration property to be greater than the default value of `32MB`, but this results in higher memory usage on the coordinator. + +[source,yaml] +---- +... +spec: + clusterConfig: + faultTolerantExecution: + retryPolicy: QUERY + queryRetryAttempts: 3 + exchangeDeduplicationBufferSize: 64MB # Increased from default 32MB +... +---- + +=== TASK retry policy + +A `TASK` retry policy instructs Trino to retry individual query tasks in the event of failure. +You **must** configure an exchange manager to use the task retry policy. +This policy is recommended when executing large batch queries, as the cluster can more efficiently retry smaller tasks within the query rather than retry the whole query. 
+ +IMPORTANT: A `TASK` retry policy is best suited for long-running queries, but this policy can result in higher latency for short-running queries executed in high volume. +As a best practice, it is recommended to run a dedicated cluster with a `TASK` retry policy for large batch queries, separate from another cluster that handles short queries. + +[source,yaml] +---- +spec: + clusterConfig: + faultTolerantExecution: + retryPolicy: TASK + taskRetryAttemptsPerTask: 4 + exchangeManager: + s3: + baseDirectories: + - "s3://trino-exchange-bucket/spooling" + connection: + reference: my-s3-connection # <1> +---- +<1> Reference to an xref:concepts:s3.adoc[S3Connection] resource + +== Exchange manager + +Exchange spooling is responsible for storing and managing spooled data for fault-tolerant execution. +You can configure a filesystem-based exchange manager that stores spooled data in a specified location, such as AWS S3 and S3-compatible systems, Azure Blob Storage or HDFS. + +NOTE: An exchange manager is required when using the `TASK` retry policy and optional for the `QUERY` retry policy. + +=== S3-compatible storage + +You can use S3-compatible storage systems for exchange spooling, including AWS S3, MinIO, and Google Cloud Storage. 
+ +[source,yaml] +---- +spec: + clusterConfig: + faultTolerantExecution: + retryPolicy: TASK + exchangeManager: + s3: + baseDirectories: # <1> + - "s3://exchange-bucket-1/trino-spooling" + connection: + reference: minio-s3-connection # <2> +--- +apiVersion: s3.stackable.tech/v1alpha1 +kind: S3Connection +metadata: + name: minio-s3-connection +spec: + host: minio.default.svc.cluster.local + port: 9000 + accessStyle: Path + credentials: + secretClass: minio-secret-class + tls: + verification: + server: + caCert: + secretClass: tls +---- +<1> Multiple S3 buckets can be specified to distribute I/O load +<2> S3 connection defined as a reference to an xref:concepts:s3.adoc[S3Connection] resource + +For Google Cloud Storage, you can use GCS buckets with S3 compatibility: + +[source,yaml] +---- +spec: + clusterConfig: + faultTolerantExecution: + exchangeManager: + s3: + baseDirectories: + - "gs://my-gcs-bucket/trino-spooling" + connection: + inline: + host: storage.googleapis.com + port: 443 + accessStyle: Path + credentials: + secretClass: gcs-hmac-credentials + tls: + verification: + server: + caCert: + webPki: {} + gcsServiceAccountKey: + secretClass: "gcs-service-account-secret-class" + key: "service-account.json" +---- + +=== Azure Blob Storage + +You can configure Azure Blob Storage as the exchange spooling destination: + +[source,yaml] +---- +spec: + clusterConfig: + faultTolerantExecution: + retryPolicy: TASK + exchangeManager: + azure: + baseDirectories: + - "abfs://exchange-container@mystorageaccount.dfs.core.windows.net/exchange-spooling" + secretClass: azure-credentials # <1> + key: connectionString # <2> +---- +<1> SecretClass providing the Azure connection string +<2> Key name in the Secret that contains the connection string (defaults to `connectionString`) + +The Azure connection string should be provided via a SecretClass that refers to a Kubernetes Secret containing the Azure storage account connection string, like this: + +[source,yaml] +---- 
+apiVersion: secrets.stackable.tech/v1alpha1 +kind: SecretClass +metadata: + name: azure-credentials +spec: + backend: + k8sSearch: + searchNamespace: + pod: {} +---- + +[source,yaml] +---- +apiVersion: v1 +kind: Secret +metadata: + name: azure-secret + labels: + secrets.stackable.tech/class: azure-credentials +type: Opaque +stringData: + connectionString: "DefaultEndpointsProtocol=https;AccountName=mystorageaccount;AccountKey=your_account_key;EndpointSuffix=core.windows.net" +---- + +=== HDFS storage + +You can configure HDFS as the exchange spooling destination: + +[source,yaml] +---- +spec: + clusterConfig: + faultTolerantExecution: + retryPolicy: TASK + exchangeManager: + hdfs: + baseDirectories: + - "hdfs://simple-hdfs/exchange-spooling" + hdfs: + configMap: simple-hdfs # <1> +---- +<1> ConfigMap containing HDFS configuration files (created by the HDFS operator) + +=== Local filesystem storage + +Local filesystem storage is supported but only recommended for development or single-node deployments. + +WARNING: Using a local filesystem for exchange is only recommended in standalone, non-production clusters. +A local directory can only be used for exchange in a distributed cluster if the exchange directory is shared and accessible from all nodes.
+ +[source,yaml] +---- +spec: + clusterConfig: + faultTolerantExecution: + retryPolicy: TASK + exchangeManager: + local: + baseDirectories: + - "/trino-exchange" + coordinators: + roleGroups: + default: + replicas: 1 + podOverrides: + spec: + volumes: + - name: trino-exchange + persistentVolumeClaim: + claimName: trino-exchange-pvc + containers: + - name: trino + volumeMounts: + - name: trino-exchange + mountPath: /trino-exchange + workers: + roleGroups: + default: + replicas: 1 + podOverrides: + spec: + volumes: + - name: trino-exchange + persistentVolumeClaim: + claimName: trino-exchange-pvc + containers: + - name: trino + volumeMounts: + - name: trino-exchange + mountPath: /trino-exchange +--- +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: trino-exchange-pvc +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 10Gi +---- + +== Connector support + +Support for fault-tolerant execution of SQL statements varies on a per-connector basis. +Take a look at the link:https://trino.io/docs/current/admin/fault-tolerant-execution.html#configuration[Trino documentation {external-link-icon}^] to see which connectors support fault-tolerant execution. + +When using connectors that do not explicitly support fault-tolerant execution, you may encounter a "This connector does not support query retries" error message.
+ +== Examples + +* link:https://github.com/stackabletech/trino-operator/blob/main/examples/simple-trino-cluster-fault-tolerant-execution.yaml[TrinoCluster with TASK retry policy and S3 exchange manager {external-link-icon}^] \ No newline at end of file diff --git a/docs/modules/trino/partials/nav.adoc b/docs/modules/trino/partials/nav.adoc index e0ec7aa0..a1ff7c0d 100644 --- a/docs/modules/trino/partials/nav.adoc +++ b/docs/modules/trino/partials/nav.adoc @@ -6,6 +6,7 @@ ** xref:trino:usage-guide/connect_to_trino.adoc[] ** xref:trino:usage-guide/listenerclass.adoc[] ** xref:trino:usage-guide/configuration.adoc[] +** xref:trino:usage-guide/fault-tolerant-execution.adoc[] ** xref:trino:usage-guide/s3.adoc[] ** xref:trino:usage-guide/security.adoc[] ** xref:trino:usage-guide/monitoring.adoc[] diff --git a/examples/simple-trino-cluster-fault-tolerant-execution.yaml b/examples/simple-trino-cluster-fault-tolerant-execution.yaml new file mode 100644 index 00000000..868b06e9 --- /dev/null +++ b/examples/simple-trino-cluster-fault-tolerant-execution.yaml @@ -0,0 +1,109 @@ +# stackablectl operator install commons secret listener trino +# helm install minio minio --repo https://charts.bitnami.com/bitnami --version 15.0.7 --set auth.rootUser=minio-access-key --set auth.rootPassword=minio-secret-key --set tls.enabled=true --set tls.existingSecret=minio-tls-certificates --set provisioning.enabled=true --set provisioning.buckets[0].name=trino-exchange-bucket + +apiVersion: trino.stackable.tech/v1alpha1 +kind: TrinoCluster +metadata: + name: trino-fault-tolerant +spec: + image: + productVersion: "476" + clusterConfig: + catalogLabelSelector: + matchLabels: + trino: trino-fault-tolerant + faultTolerantExecution: + retryPolicy: TASK + taskRetryAttemptsPerTask: 4 + retryInitialDelay: 10s + retryMaxDelay: 60s + retryDelayScaleFactor: "2.0" + exchangeDeduplicationBufferSize: 64MB + exchangeEncryptionEnabled: true + exchangeManager: + sinkBufferPoolMinSize: 20 + 
sinkBuffersPerPartition: 4 + sinkMaxFileSize: 2GB + sourceConcurrentReaders: 8 + s3: + baseDirectories: + - "s3://trino-exchange-bucket/spooling" + connection: + reference: minio-connection + maxErrorRetries: 10 + uploadPartSize: 10MB + coordinators: + roleGroups: + default: + replicas: 1 + workers: + roleGroups: + default: + replicas: 3 +--- +apiVersion: s3.stackable.tech/v1alpha1 +kind: S3Connection +metadata: + name: minio-connection +spec: + host: minio + port: 9000 + accessStyle: Path + credentials: + secretClass: minio-credentials + tls: + verification: + server: + caCert: + secretClass: minio-tls-certificates +--- +apiVersion: secrets.stackable.tech/v1alpha1 +kind: SecretClass +metadata: + name: minio-tls-certificates +spec: + backend: + k8sSearch: + searchNamespace: + pod: {} +--- +apiVersion: v1 +kind: Secret +metadata: + name: minio-tls-certificates + labels: + secrets.stackable.tech/class: minio-tls-certificates +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQyVENDQXNHZ0F3SUJBZ0lVTmpxdUdZV3R5SjVhNnd5MjNIejJHUmNNbHdNd0RRWUpLb1pJaHZjTkFRRUwKQlFBd2V6RUxNQWtHQTFVRUJoTUNSRVV4R3pBWkJnTlZCQWdNRWxOamFHeGxjM2RwWnkxSWIyeHpkR1ZwYmpFTwpNQXdHQTFVRUJ3d0ZWMlZrWld3eEtEQW1CZ05WQkFvTUgxTjBZV05yWVdKc1pTQlRhV2R1YVc1bklFRjFkR2h2CmNtbDBlU0JKYm1NeEZUQVRCZ05WQkFNTURITjBZV05yWVdKc1pTNWtaVEFnRncweU16QTJNVFl4TWpVeE1ESmEKR0E4eU1USXpNRFV5TXpFeU5URXdNbG93ZXpFTE1Ba0dBMVVFQmhNQ1JFVXhHekFaQmdOVkJBZ01FbE5qYUd4bApjM2RwWnkxSWIyeHpkR1ZwYmpFT01Bd0dBMVVFQnd3RlYyVmtaV3d4S0RBbUJnTlZCQW9NSDFOMFlXTnJZV0pzClpTQlRhV2R1YVc1bklFRjFkR2h2Y21sMGVTQkpibU14RlRBVEJnTlZCQU1NREhOMFlXTnJZV0pzWlM1a1pUQ0MKQVNJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dFUEFEQ0NBUW9DZ2dFQkFOblYvdmJ5M1JvNTdhMnF2UVJubjBqZQplS01VMitGMCtsWk5DQXZpR1VENWJtOGprOTFvUFpuazBiaFFxZXlFcm1EUzRXVDB6ZXZFUklCSkpEamZMMEQ4CjQ2QmU3UGlNS2UwZEdqb3FJM3o1Y09JZWpjOGFMUEhTSWxnTjZsVDNmSXJ1UzE2Y29RZ0c0dWFLaUhGNStlV0YKRFJVTGR1NmRzWXV6NmRLanFSaVVPaEh3RHd0VUprRHdQditFSXRxbzBIK01MRkxMWU0wK2xFSWFlN2RONUNRNQpTbzVXaEwyY3l2NVZKN2xqL0VBS0NWaUlFZ0NtekRSRGNSZ1NTald5SD
RibjZ5WDIwMjZmUEl5V0pGeUVkTC82CmpBT0pBRERSMEd5aE5PWHJFZXFob2NTTW5JYlFWcXdBVDBrTWh1WFN2d3Zscm5MeVRwRzVqWm00bFVNMzRrTUMKQXdFQUFhTlRNRkV3SFFZRFZSME9CQllFRkVJM1JNTWl5aUJqeVExUlM4bmxPUkpWZDFwQk1COEdBMVVkSXdRWQpNQmFBRkVJM1JNTWl5aUJqeVExUlM4bmxPUkpWZDFwQk1BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJCmh2Y05BUUVMQlFBRGdnRUJBSHRLUlhkRmR0VWh0VWpvZG1ZUWNlZEFEaEhaT2hCcEtpbnpvdTRicmRrNEhmaEYKTHIvV0ZsY1JlbWxWNm1Cc0xweU11SytUZDhaVUVRNkpFUkx5NmxTL2M2cE9HeG5CNGFDbEU4YXQrQytUakpBTwpWbTNXU0k2VlIxY0ZYR2VaamxkVlE2eGtRc2tNSnpPN2RmNmlNVFB0VjVSa01lSlh0TDZYYW1FaTU0ckJvZ05ICk5yYStFSkJRQmwvWmU5ME5qZVlidjIwdVFwWmFhWkZhYVNtVm9OSERwQndsYTBvdXkrTWpPYkMzU3BnT3ExSUMKUGwzTnV3TkxWOFZiT3I1SHJoUUFvS21nU05iM1A4dmFUVnV4L1gwWWZqeS9TN045a1BCYUs5bUZqNzR6d1Y5dwpxU1ExNEtsNWpPM1YzaHJHV1laRWpET2diWnJyRVgxS1hFdXN0K1E9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUR5RENDQXJDZ0F3SUJBZ0lVQ0kyUE5OcnR6cDZRbDdHa3VhRnhtRGE2VUJvd0RRWUpLb1pJaHZjTkFRRUwKQlFBd2V6RUxNQWtHQTFVRUJoTUNSRVV4R3pBWkJnTlZCQWdNRWxOamFHeGxjM2RwWnkxSWIyeHpkR1ZwYmpFTwpNQXdHQTFVRUJ3d0ZWMlZrWld3eEtEQW1CZ05WQkFvTUgxTjBZV05yWVdKc1pTQlRhV2R1YVc1bklFRjFkR2h2CmNtbDBlU0JKYm1NeEZUQVRCZ05WQkFNTURITjBZV05yWVdKc1pTNWtaVEFnRncweU16QTJNVFl4TWpVeE1ESmEKR0E4eU1USXpNRFV5TXpFeU5URXdNbG93WGpFTE1Ba0dBMVVFQmhNQ1JFVXhHekFaQmdOVkJBZ01FbE5qYUd4bApjM2RwWnkxSWIyeHpkR1ZwYmpFT01Bd0dBMVVFQnd3RlYyVmtaV3d4RWpBUUJnTlZCQW9NQ1ZOMFlXTnJZV0pzClpURU9NQXdHQTFVRUF3d0ZiV2x1YVc4d2dnRWlNQTBHQ1NxR1NJYjNEUUVCQVFVQUE0SUJEd0F3Z2dFS0FvSUIKQVFDanluVnorWEhCOE9DWTRwc0VFWW1qb2JwZHpUbG93d2NTUU4rWURQQ2tCZW9yMFRiODdFZ0x6SksrSllidQpwb1hCbE5JSlBRYW93SkVvL1N6U2s4ZnUyWFNNeXZBWlk0RldHeEp5Mnl4SXh2UC9pYk9HT1l1aVBHWEsyNHQ2ClpjR1RVVmhhdWlaR1Nna1dyZWpXV2g3TWpGUytjMXZhWVpxQitRMXpQczVQRk1sYzhsNVYvK2I4WjdqTUppODQKbU9mSVB4amt2SXlKcjVVa2VGM1VmTHFKUzV5NExGNHR5NEZ0MmlBZDdiYmZIYW5mdlltdjZVb0RWdE1YdFdvMQpvUVBmdjNzaFdybVJMenc2ZXVJQXRiWGM1Q2pCeUlha0NiaURuQVU4cktnK0IxSjRtdlFnckx3bzNxUHJ5Smd4ClNkaWRtWjJtRVI3RXorYzVCMG0vTGlJaEFnTUJBQUdqWHpCZE1Cc0dBMVVkRVFRVU1CS0NCVzFwYm1sdmdnbHMKYjJOaGJHaH
ZjM1F3SFFZRFZSME9CQllFRkpRMGdENWtFdFFyK3REcERTWjdrd1o4SDVoR01COEdBMVVkSXdRWQpNQmFBRkVJM1JNTWl5aUJqeVExUlM4bmxPUkpWZDFwQk1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRQmNkaGQrClI0Sm9HdnFMQms1OWRxSVVlY2N0dUZzcmRQeHNCaU9GaFlOZ1pxZWRMTTBVTDVEenlmQUhmVk8wTGZTRURkZFgKUkpMOXlMNytrTVUwVDc2Y3ZkQzlYVkFJRTZIVXdUbzlHWXNQcXN1eVpvVmpOcEVESkN3WTNDdm9ubEpWZTRkcQovZ0FiSk1ZQitUU21ZNXlEUHovSkZZL1haellhUGI3T2RlR3VqYlZUNUl4cDk3QXBTOFlJaXY3M0Mwd1ViYzZSCmgwcmNmUmJ5a1NRVWg5dmdWZFhSU1I4RFQzV0NmZHFOek5CWVh2OW1xZlc1ejRzYkdqK2wzd1VsL0kzRi9tSXcKZnlPNEN0aTRha2lHVkhsZmZFeTB3a3pWYUJ4aGNYajJJM0JVVGhCNFpxamxzc2llVmFGa3d2WG1teVJUMG9FVwo1SCtOUEhjcXVTMXpQc2NsCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2QUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktZd2dnU2lBZ0VBQW9JQkFRQ2p5blZ6K1hIQjhPQ1kKNHBzRUVZbWpvYnBkelRsb3d3Y1NRTitZRFBDa0Jlb3IwVGI4N0VnTHpKSytKWWJ1cG9YQmxOSUpQUWFvd0pFbwovU3pTazhmdTJYU015dkFaWTRGV0d4SnkyeXhJeHZQL2liT0dPWXVpUEdYSzI0dDZaY0dUVVZoYXVpWkdTZ2tXCnJlaldXaDdNakZTK2MxdmFZWnFCK1ExelBzNVBGTWxjOGw1Vi8rYjhaN2pNSmk4NG1PZklQeGprdkl5SnI1VWsKZUYzVWZMcUpTNXk0TEY0dHk0RnQyaUFkN2JiZkhhbmZ2WW12NlVvRFZ0TVh0V28xb1FQZnYzc2hXcm1STHp3NgpldUlBdGJYYzVDakJ5SWFrQ2JpRG5BVThyS2crQjFKNG12UWdyTHdvM3FQcnlKZ3hTZGlkbVoybUVSN0V6K2M1CkIwbS9MaUloQWdNQkFBRUNnZ0VBQWQzdDVzdUNFMjdXY0llc3NxZ3NoSFAwZHRzKyswVzF6K3h6WC8xTnhPRFkKWVhWNkJmbi9mRHJ4dFQ4aVFaZ2VVQzJORTFQaHZveXJXdWMvMm9xYXJjdEd1OUFZV29HNjJLdG9VMnpTSFdZLwpJN3VERTFXV2xOdlJZVFdOYW5DOGV4eGpRRzE4d0RKWjFpdFhTeEl0NWJEM3lrL3dUUlh0dCt1SnpyVjVqb2N1CmNoeERMd293aXUxQWo2ZFJDWk5CejlUSnh5TnI1ME5ZVzJVWEJhVC84N1hyRkZkSndNVFZUMEI3SE9uRzdSQlYKUWxLdzhtcVZiYU5lbmhjdk1qUjI5c3hUekhSK2p4SU8zQndPNk9Hai9PRmhGQllVN1RMWGVsZDFxb2UwdmIyRwpiOGhQcEd1cHRyNUF0OWx3MXc1d1EzSWdpdXRQTkg1cXlEeUNwRWw2RVFLQmdRRGNkYnNsT2ZLSmo3TzJMQXlZCkZ0a1RwaWxFMFYzajBxbVE5M0lqclY0K0RSbUxNRUIyOTk0MDdCVVlRUWoxL0RJYlFjb1oyRUVjVUI1cGRlSHMKN0RNRUQ2WExIYjJKVTEyK2E3c1d5Q05kS2VjZStUNy9JYmxJOFR0MzQwVWxIUTZ6U01TRGNqdmZjRkhWZ3YwcwpDYWpoRng3TmtMRVhUWnI4ZlQzWUloajR2UUtCZ1FDK01nWjFVbW9KdzlJQVFqMnVJVTVDeTl4aldlWURUQU
8vCllhWEl6d2xnZTQzOE1jYmI0Y04yU2FOU0dEZ1Y3bnU1a3FpaWhwalBZV0lpaU9CcDlrVFJIWE9kUFc0N3N5ZUkKdDNrd3JwMnpWbFVnbGNNWlo2bW1WM1FWYUFOWmdqVTRSU3Y0ZS9WeFVMamJaYWZqUHRaUnNqWkdwSzBZVTFvdApWajhJZVE3Zk5RS0JnQ1ArWk11ekpsSW5VQ1FTRlF4UHpxbFNtN0pNckpPaHRXV2h3TlRxWFZTc050dHV5VmVqCktIaGpneDR1b0JQcFZSVDJMTlVEWmI0RnByRjVPYVhBK3FOVEdyS0s3SU1iUlZidHArSVVVeEhHNGFGQStIUVgKUVhVVFRhNUpRT1RLVmJnWHpWM1lyTVhTUk1valZNcDMyVWJHeTVTc1p2MXpBamJ2QzhYWjYxSFJBb0dBZEJjUQp2aGU1eFpBUzVEbUtjSGkvemlHa3ViZXJuNk9NUGdxYUtJSEdsVytVOExScFR0ajBkNFRtL1Rydk1PUEovVEU1CllVcUtoenBIcmhDaCtjdHBvY0k2U1dXdm5SenpLbzNpbVFaY0Y1VEFqUTBjY3F0RmI5UzlkRHR5bi9YTUNqYWUKYWlNdll5VUVVRll5TFpDelBGWnNycDNoVVpHKzN5RmZoQXB3TzJrQ2dZQkh3WWFQSWRXNld3NytCMmhpbjBvdwpqYTNjZXN2QTRqYU1Qd1NMVDhPTnRVMUdCU01md2N6TWJuUEhMclJ2Qjg3bjlnUGFSMndRR1VtckZFTzNMUFgvCmtSY09HcFlCSHBEWEVqRGhLa1dkUnVMT0ZnNEhMWmRWOEFOWmxRMFZTY0U4dTNkRERVTzg5cEdEbjA4cVRBcmwKeDlreHN1ZEVWcmtlclpiNVV4RlZxUT09Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K +--- +apiVersion: secrets.stackable.tech/v1alpha1 +kind: SecretClass +metadata: + name: minio-credentials +spec: + backend: + k8sSearch: + searchNamespace: + pod: {} +--- +apiVersion: v1 +kind: Secret +metadata: + name: minio-credentials-secret + labels: + secrets.stackable.tech/class: minio-credentials +stringData: + accessKey: minio-access-key + secretKey: minio-secret-key +--- +apiVersion: trino.stackable.tech/v1alpha1 +kind: TrinoCatalog +metadata: + name: tpch + labels: + trino: trino-fault-tolerant +spec: + connector: + tpch: {} \ No newline at end of file From 5a247ff335f452347ced139e8c41988b0c1014ea Mon Sep 17 00:00:00 2001 From: dervoeti Date: Tue, 5 Aug 2025 20:41:02 +0200 Subject: [PATCH 05/26] chore: changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a870f76..b73fdf84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. 
## [Unreleased] +### Added + +- Support for fault-tolerant execution ([#779]). + +[#779]: https://github.com/stackabletech/trino-operator/pull/779 + ## [25.7.0] - 2025-07-23 ## [25.7.0-rc1] - 2025-07-18 From 39eecfac9ba1cf1e22a682e8f330f744c477ebc1 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Tue, 5 Aug 2025 21:25:49 +0200 Subject: [PATCH 06/26] fix: lint fixes --- .../trino/pages/usage-guide/fault-tolerant-execution.adoc | 2 +- examples/simple-trino-cluster-fault-tolerant-execution.yaml | 4 ++-- rust/operator-binary/src/controller.rs | 1 + .../kuttl/fault-tolerant-execution/05-run-tests.yaml | 2 +- tests/templates/kuttl/fault-tolerant-execution/check-fte.py | 3 +++ 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index 4ad80ed5..55c38a29 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -285,4 +285,4 @@ When using connectors that do not explicitly support fault-tolerant execution, y == Examples -* link:https://github.com/stackabletech/trino-operator/blob/main/examples/simple-trino-cluster-fault-tolerant-execution.yaml[TrinoCluster with TASK retry policy and S3 exchange manager {external-link-icon}^] \ No newline at end of file +* link:https://github.com/stackabletech/trino-operator/blob/main/examples/simple-trino-cluster-fault-tolerant-execution.yaml[TrinoCluster with TASK retry policy and S3 exchange manager {external-link-icon}^] diff --git a/examples/simple-trino-cluster-fault-tolerant-execution.yaml b/examples/simple-trino-cluster-fault-tolerant-execution.yaml index 868b06e9..4e8d3f19 100644 --- a/examples/simple-trino-cluster-fault-tolerant-execution.yaml +++ b/examples/simple-trino-cluster-fault-tolerant-execution.yaml @@ -1,6 +1,6 @@ # stackablectl operator install commons secret listener trino # helm 
install minio minio --repo https://charts.bitnami.com/bitnami --version 15.0.7 --set auth.rootUser=minio-access-key --set auth.rootPassword=minio-secret-key --set tls.enabled=true --set tls.existingSecret=minio-tls-certificates --set provisioning.enabled=true --set provisioning.buckets[0].name=trino-exchange-bucket - +--- apiVersion: trino.stackable.tech/v1alpha1 kind: TrinoCluster metadata: @@ -106,4 +106,4 @@ metadata: trino: trino-fault-tolerant spec: connector: - tpch: {} \ No newline at end of file + tpch: {} diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index c30b1b3b..54e1400e 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -1582,6 +1582,7 @@ fn create_tls_volume( .build()) } +#[allow(clippy::too_many_arguments)] fn tls_volume_mounts( trino: &v1alpha1::TrinoCluster, trino_role: &TrinoRole, diff --git a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml index acab7b53..988161c7 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml @@ -13,4 +13,4 @@ commands: echo "Objects count is $count (not > 0)" exit 1 fi - timeout: 20 \ No newline at end of file + timeout: 20 diff --git a/tests/templates/kuttl/fault-tolerant-execution/check-fte.py b/tests/templates/kuttl/fault-tolerant-execution/check-fte.py index 8685cb1f..f4ecb339 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/check-fte.py +++ b/tests/templates/kuttl/fault-tolerant-execution/check-fte.py @@ -2,6 +2,7 @@ import trino import argparse + def get_connection(coordinator): """Create anonymous connection for basic cluster health check""" conn = trino.dbapi.connect( @@ -14,6 +15,7 @@ def get_connection(coordinator): ) return conn + if __name__ == "__main__": # Construct an argument parser all_args = argparse.ArgumentParser() @@ 
-89,5 +91,6 @@ def get_connection(coordinator): except Exception as e: print(f"Test failed with error: {e}") import traceback + traceback.print_exc() exit(-1) From ee145e00b886666740e043df515a61709042531f Mon Sep 17 00:00:00 2001 From: Lukas Krug Date: Wed, 6 Aug 2025 13:01:51 +0200 Subject: [PATCH 07/26] Update docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc Co-authored-by: Sebastian Bernauer --- .../trino/pages/usage-guide/fault-tolerant-execution.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index 55c38a29..9b82c474 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -15,7 +15,7 @@ Take a look at the link:https://trino.io/docs/current/admin/fault-tolerant-execu == Configuration -Fault-tolerant execution is turned off by default. +Fault-tolerant execution is not enabled by default. 
To enable the feature, you need to configure it in your `TrinoCluster` resource by adding a `faultTolerantExecution` section to the cluster configuration: [source,yaml] From 47d2ef57d9fe86cfa48596811649bc6520a1e0c4 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Wed, 6 Aug 2025 14:30:35 +0200 Subject: [PATCH 08/26] fix: fixed review feedback --- .../usage-guide/fault-tolerant-execution.yaml | 5 ++--- .../usage-guide/fault-tolerant-execution.adoc | 18 +++++++++++++++--- .../src/crd/fault_tolerant_execution.rs | 14 +++++--------- ....yaml => 01_helm-bitnami-minio-values.yaml} | 0 .../{check-fte.py => 04_check-fte.py} | 0 5 files changed, 22 insertions(+), 15 deletions(-) rename examples/simple-trino-cluster-fault-tolerant-execution.yaml => docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml (95%) rename tests/templates/kuttl/fault-tolerant-execution/{helm-bitnami-minio-values.yaml => 01_helm-bitnami-minio-values.yaml} (100%) rename tests/templates/kuttl/fault-tolerant-execution/{check-fte.py => 04_check-fte.py} (100%) diff --git a/examples/simple-trino-cluster-fault-tolerant-execution.yaml b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml similarity index 95% rename from examples/simple-trino-cluster-fault-tolerant-execution.yaml rename to docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml index 4e8d3f19..b2a768b4 100644 --- a/examples/simple-trino-cluster-fault-tolerant-execution.yaml +++ b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml @@ -1,5 +1,3 @@ -# stackablectl operator install commons secret listener trino -# helm install minio minio --repo https://charts.bitnami.com/bitnami --version 15.0.7 --set auth.rootUser=minio-access-key --set auth.rootPassword=minio-secret-key --set tls.enabled=true --set tls.existingSecret=minio-tls-certificates --set provisioning.enabled=true --set provisioning.buckets[0].name=trino-exchange-bucket --- apiVersion: trino.stackable.tech/v1alpha1 kind: 
TrinoCluster @@ -17,7 +15,7 @@ spec: taskRetryAttemptsPerTask: 4 retryInitialDelay: 10s retryMaxDelay: 60s - retryDelayScaleFactor: "2.0" + retryDelayScaleFactor: 2.0 exchangeDeduplicationBufferSize: 64MB exchangeEncryptionEnabled: true exchangeManager: @@ -107,3 +105,4 @@ metadata: spec: connector: tpch: {} + diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index 9b82c474..eb95927d 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -61,6 +61,7 @@ This policy is recommended when executing large batch queries, as the cluster ca IMPORTANT: A `TASK` retry policy is best suited for long-running queries, but this policy can result in higher latency for short-running queries executed in high volume. As a best practice, it is recommended to run a dedicated cluster with a `TASK` retry policy for large batch queries, separate from another cluster that handles short queries. +There are tools that can help you achieve this by automatically routing queries based on certain criteria (such as query estimates or user) to different Trino clusters. Notable mentions are link:https://github.com/stackabletech/trino-lb[trino-lb {external-link-icon}^] and link:https://github.com/trinodb/trino-gateway[trino-gateway {external-link-icon}^]. [source,yaml] ---- @@ -273,7 +274,7 @@ spec: - ReadWriteOnce resources: requests: - storage: 10Gi + storage: 50Gi ---- == Connector support @@ -283,6 +284,17 @@ Take a look at the link:https://trino.io/docs/current/admin/fault-tolerant-execu When using connectors that do not explicitly support fault-tolerant execution, you may encounter a "This connector does not support query retries" error message. 
-== Examples +== Example -* link:https://github.com/stackabletech/trino-operator/blob/main/examples/simple-trino-cluster-fault-tolerant-execution.yaml[TrinoCluster with TASK retry policy and S3 exchange manager {external-link-icon}^] +Here's an example of a Trino cluster with fault-tolerant execution enabled using the `TASK` retry policy and MinIO backed S3 as the exchange manager: + +[source,bash] +---- +stackablectl operator install commons secret listener trino +helm install minio minio --repo https://charts.bitnami.com/bitnami --version 15.0.7 --set auth.rootUser=minio-access-key --set auth.rootPassword=minio-secret-key --set tls.enabled=true --set tls.existingSecret=minio-tls-certificates --set provisioning.enabled=true --set provisioning.buckets[0].name=trino-exchange-bucket +---- + +[source,yaml] +---- +include::example$usage-guide/fault-tolerant-execution.yaml[] +---- diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs b/rust/operator-binary/src/crd/fault_tolerant_execution.rs index f39340a5..31da6617 100644 --- a/rust/operator-binary/src/crd/fault_tolerant_execution.rs +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -25,7 +25,7 @@ use crate::{ crd::{CONFIG_DIR_NAME, STACKABLE_CLIENT_TLS_DIR}, }; -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] pub struct FaultTolerantExecutionConfig { /// The retry policy for fault tolerant execution. @@ -58,7 +58,7 @@ pub struct FaultTolerantExecutionConfig { /// Factor by which retry delay is increased on each query or task failure. #[serde(skip_serializing_if = "Option::is_none")] - pub retry_delay_scale_factor: Option, + pub retry_delay_scale_factor: Option, /// Data size of the coordinator's in-memory buffer used to store output of query stages. 
#[serde(skip_serializing_if = "Option::is_none")] @@ -70,7 +70,7 @@ pub struct FaultTolerantExecutionConfig { } #[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +#[serde(rename_all = "camelCase")] pub enum RetryPolicy { /// Retry entire queries on failure Query, @@ -117,16 +117,12 @@ pub struct ExchangeManagerGeneralConfig { #[serde(rename_all = "camelCase")] pub enum ExchangeManagerBackend { /// S3-compatible storage configuration (includes AWS S3, MinIO, GCS). - #[serde(rename = "s3")] S3(S3ExchangeConfig), /// Azure Blob Storage configuration. - #[serde(rename = "azure")] Azure(AzureExchangeConfig), /// HDFS-based exchange manager. - #[serde(rename = "hdfs")] Hdfs(HdfsExchangeConfig), /// Local filesystem storage (not recommended for production). - #[serde(rename = "local")] Local(LocalExchangeConfig), } @@ -646,7 +642,7 @@ mod tests { task_retry_attempts_per_task: None, retry_initial_delay: Some(Duration::from_secs(15)), retry_max_delay: Some(Duration::from_secs(90)), - retry_delay_scale_factor: Some("3.0".to_string()), + retry_delay_scale_factor: Some(3.0), exchange_deduplication_buffer_size: Some("64MB".to_string()), exchange_encryption_enabled: Some(false), }; @@ -674,7 +670,7 @@ mod tests { ); assert_eq!( fte_config.config_properties.get("retry-delay-scale-factor"), - Some(&"3.0".to_string()) + Some(&"3".to_string()) ); assert_eq!( fte_config diff --git a/tests/templates/kuttl/fault-tolerant-execution/helm-bitnami-minio-values.yaml b/tests/templates/kuttl/fault-tolerant-execution/01_helm-bitnami-minio-values.yaml similarity index 100% rename from tests/templates/kuttl/fault-tolerant-execution/helm-bitnami-minio-values.yaml rename to tests/templates/kuttl/fault-tolerant-execution/01_helm-bitnami-minio-values.yaml diff --git a/tests/templates/kuttl/fault-tolerant-execution/check-fte.py b/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py similarity index 100% rename from 
tests/templates/kuttl/fault-tolerant-execution/check-fte.py rename to tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py From 056f84be7798736658278046982367a7f373fe9b Mon Sep 17 00:00:00 2001 From: dervoeti Date: Wed, 6 Aug 2025 14:31:32 +0200 Subject: [PATCH 09/26] feat!: remove explicit Azure and GCS support --- deploy/helm/trino-operator/crds/crds.yaml | 60 +------ .../src/crd/fault_tolerant_execution.rs | 147 +----------------- 2 files changed, 8 insertions(+), 199 deletions(-) diff --git a/deploy/helm/trino-operator/crds/crds.yaml b/deploy/helm/trino-operator/crds/crds.yaml index aa8dfd02..8e82b0d7 100644 --- a/deploy/helm/trino-operator/crds/crds.yaml +++ b/deploy/helm/trino-operator/crds/crds.yaml @@ -123,46 +123,11 @@ spec: oneOf: - required: - s3 - - required: - - azure - required: - hdfs - required: - local properties: - azure: - description: Azure Blob Storage configuration. - properties: - baseDirectories: - description: Azure Blob Storage container URIs for spooling data. - items: - type: string - type: array - blockSize: - description: Block data size for Azure block blob parallel upload. - nullable: true - type: string - endpoint: - description: Azure blob endpoint URL (optional, used instead of connection string). - nullable: true - type: string - key: - description: Key name in the Secret that contains the connection string. - nullable: true - type: string - maxErrorRetries: - description: Maximum number of times the Azure client should retry a request. - format: uint32 - minimum: 0.0 - nullable: true - type: integer - secretClass: - description: '[SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) providing the Azure connection string.' - type: string - required: - - baseDirectories - - secretClass - type: object hdfs: description: HDFS-based exchange manager. 
properties: @@ -204,10 +169,10 @@ spec: - baseDirectories type: object s3: - description: S3-compatible storage configuration (includes AWS S3, MinIO, GCS). + description: S3-compatible storage configuration. properties: baseDirectories: - description: S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). For GCS, use gs:// URIs (e.g., gs://bucket1,gs://bucket2). + description: S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). items: type: string type: array @@ -337,20 +302,6 @@ spec: description: External ID for the IAM role trust policy. nullable: true type: string - gcsServiceAccountKey: - description: Google Cloud Storage service account key in JSON format. Required when using GCS (gs:// URIs). Should contain the JSON service account key. The operator will mount this as a file and configure `exchange.gcs.json-key-file-path`. - nullable: true - properties: - key: - description: Key name in the Secret that contains the JSON service account key. - nullable: true - type: string - secretClass: - description: '[SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) providing the GCS service account key.' - type: string - required: - - secretClass - type: object iamRole: description: IAM role to assume for S3 access. nullable: true @@ -400,8 +351,9 @@ spec: type: integer retryDelayScaleFactor: description: Factor by which retry delay is increased on each query or task failure. + format: float nullable: true - type: string + type: number retryInitialDelay: description: Minimum time that a failed query or task must wait before it is retried. nullable: true @@ -413,8 +365,8 @@ spec: retryPolicy: description: The retry policy for fault tolerant execution. `QUERY` retries entire queries, `TASK` retries individual tasks. When set to `TASK`, an exchange manager must be configured. 
enum: - - QUERY - - TASK + - query + - task type: string taskRetryAttemptsPerTask: description: Maximum number of times Trino may attempt to retry a single task before declaring the query failed. Only applies to `TASK` retry policy. diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs b/rust/operator-binary/src/crd/fault_tolerant_execution.rs index 31da6617..aaa192bd 100644 --- a/rust/operator-binary/src/crd/fault_tolerant_execution.rs +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -10,7 +10,7 @@ use std::collections::BTreeMap; use serde::{Deserialize, Serialize}; use snafu::Snafu; use stackable_operator::{ - builder::pod::volume::{SecretOperatorVolumeSourceBuilder, VolumeBuilder, VolumeMountBuilder}, + builder::pod::volume::{VolumeBuilder, VolumeMountBuilder}, client::Client, commons::tls_verification::{CaCert, TlsServerVerification, TlsVerification}, crd::s3, @@ -116,10 +116,8 @@ pub struct ExchangeManagerGeneralConfig { #[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] pub enum ExchangeManagerBackend { - /// S3-compatible storage configuration (includes AWS S3, MinIO, GCS). + /// S3-compatible storage configuration. S3(S3ExchangeConfig), - /// Azure Blob Storage configuration. - Azure(AzureExchangeConfig), /// HDFS-based exchange manager. Hdfs(HdfsExchangeConfig), /// Local filesystem storage (not recommended for production). @@ -130,7 +128,6 @@ pub enum ExchangeManagerBackend { #[serde(rename_all = "camelCase")] pub struct S3ExchangeConfig { /// S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). - /// For GCS, use gs:// URIs (e.g., gs://bucket1,gs://bucket2). pub base_directories: Vec, /// S3 connection configuration. /// Learn more about S3 configuration in the [S3 concept docs](DOCS_BASE_URL_PLACEHOLDER/concepts/s3). @@ -147,42 +144,6 @@ pub struct S3ExchangeConfig { /// Part data size for S3 multi-part upload. 
#[serde(skip_serializing_if = "Option::is_none")] pub upload_part_size: Option, - /// Google Cloud Storage service account key in JSON format. - /// Required when using GCS (gs:// URIs). Should contain the JSON service account key. - /// The operator will mount this as a file and configure `exchange.gcs.json-key-file-path`. - #[serde(skip_serializing_if = "Option::is_none")] - pub gcs_service_account_key: Option, -} - -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct GcsServiceAccountKey { - /// [SecretClass](DOCS_BASE_URL_PLACEHOLDER/secret-operator/secretclass) providing the GCS service account key. - pub secret_class: String, - /// Key name in the Secret that contains the JSON service account key. - #[serde(skip_serializing_if = "Option::is_none")] - pub key: Option, -} - -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct AzureExchangeConfig { - /// Azure Blob Storage container URIs for spooling data. - pub base_directories: Vec, - /// [SecretClass](DOCS_BASE_URL_PLACEHOLDER/secret-operator/secretclass) providing the Azure connection string. - pub secret_class: String, - /// Key name in the Secret that contains the connection string. - #[serde(skip_serializing_if = "Option::is_none")] - pub key: Option, - /// Azure blob endpoint URL (optional, used instead of connection string). - #[serde(skip_serializing_if = "Option::is_none")] - pub endpoint: Option, - /// Block data size for Azure block blob parallel upload. - #[serde(skip_serializing_if = "Option::is_none")] - pub block_size: Option, - /// Maximum number of times the Azure client should retry a request. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub max_error_retries: Option, } #[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] @@ -219,11 +180,6 @@ pub enum Error { #[snafu(display("trino does not support disabling the TLS verification of S3 servers"))] S3TlsNoVerificationNotSupported, - - #[snafu(display("Failed to build Azure SecretClass volume"))] - AzureSecretClassVolumeBuild { - source: stackable_operator::builder::pod::volume::SecretOperatorVolumeSourceBuilderError, - }, } /// Fault tolerant execution configuration with external resources resolved @@ -366,32 +322,6 @@ impl ResolvedFaultTolerantExecutionConfig { s3_config.upload_part_size.as_ref(), ); } - ExchangeManagerBackend::Azure(azure_config) => { - exchange_manager_properties.insert( - "exchange-manager.name".to_string(), - "filesystem".to_string(), - ); - exchange_manager_properties.insert( - "exchange.base-directories".to_string(), - azure_config.base_directories.join(","), - ); - - Self::insert_if_present( - &mut exchange_manager_properties, - "exchange.azure.endpoint", - azure_config.endpoint.as_ref(), - ); - Self::insert_if_present( - &mut exchange_manager_properties, - "exchange.azure.block-size", - azure_config.block_size.as_ref(), - ); - Self::insert_if_present( - &mut exchange_manager_properties, - "exchange.azure.max-error-retries", - azure_config.max_error_retries, - ); - } ExchangeManagerBackend::Hdfs(hdfs_config) => { exchange_manager_properties .insert("exchange-manager.name".to_string(), "hdfs".to_string()); @@ -448,9 +378,6 @@ impl ResolvedFaultTolerantExecutionConfig { .resolve_s3_backend(s3_config, client, namespace) .await?; } - ExchangeManagerBackend::Azure(azure_config) => { - resolved_config.resolve_azure_backend(azure_config).await?; - } ExchangeManagerBackend::Hdfs(hdfs_config) => { resolved_config.resolve_hdfs_backend(hdfs_config); } @@ -541,75 +468,6 @@ impl ResolvedFaultTolerantExecutionConfig { } } - if let Some(gcs_key_config) = 
&s3_config.gcs_service_account_key { - let gcs_secret_mount_dir = format!("{CONFIG_DIR_NAME}/exchange-gcs-key"); - let volume_name = "exchange-gcs-key".to_string(); - let default_key_name = "key.json".to_string(); - let key_name = gcs_key_config.key.as_ref().unwrap_or(&default_key_name); - - let secret_volume_source = - SecretOperatorVolumeSourceBuilder::new(&gcs_key_config.secret_class) - .build() - .context(AzureSecretClassVolumeBuildSnafu)?; - - self.volumes.push( - VolumeBuilder::new(&volume_name) - .ephemeral(secret_volume_source) - .build(), - ); - self.volume_mounts.push( - VolumeMountBuilder::new(&volume_name, &gcs_secret_mount_dir) - .read_only(true) - .build(), - ); - - let json_key_file_path = format!("{gcs_secret_mount_dir}/{key_name}"); - self.exchange_manager_properties.insert( - "exchange.gcs.json-key-file-path".to_string(), - json_key_file_path, - ); - } - - Ok(()) - } - - async fn resolve_azure_backend( - &mut self, - azure_config: &AzureExchangeConfig, - ) -> Result<(), Error> { - use snafu::ResultExt; - - let azure_secret_mount_dir = format!("{CONFIG_DIR_NAME}/exchange-azure-secret"); - let volume_name = "exchange-azure-secret".to_string(); - let default_key_name = "connectionString".to_string(); - let key_name = azure_config.key.as_ref().unwrap_or(&default_key_name); - - let secret_volume_source = - SecretOperatorVolumeSourceBuilder::new(&azure_config.secret_class) - .build() - .context(AzureSecretClassVolumeBuildSnafu)?; - - self.volumes.push( - VolumeBuilder::new(&volume_name) - .ephemeral(secret_volume_source) - .build(), - ); - self.volume_mounts.push( - VolumeMountBuilder::new(&volume_name, &azure_secret_mount_dir) - .read_only(true) - .build(), - ); - - let connection_string_env = "EXCHANGE_AZURE_CONNECTION_STRING".to_string(); - self.exchange_manager_properties.insert( - "exchange.azure.connection-string".to_string(), - format!("${{ENV:{connection_string_env}}}"), - ); - - let connection_string_path = 
format!("{azure_secret_mount_dir}/{key_name}"); - self.load_env_from_files - .insert(connection_string_env, connection_string_path); - Ok(()) } @@ -729,7 +587,6 @@ mod tests { external_id: Some("external-id-123".to_string()), max_error_retries: Some(5), upload_part_size: Some("10MB".to_string()), - gcs_service_account_key: None, }), }), query_retry_attempts: None, From aefba45e9cdc2f6dfb13d6e362f3c0b961d478bb Mon Sep 17 00:00:00 2001 From: dervoeti Date: Wed, 6 Aug 2025 16:05:29 +0200 Subject: [PATCH 10/26] feat: use PascalCase for Query/Task / allow configOverrides for exchange manager --- deploy/helm/trino-operator/crds/crds.yaml | 18 ++- .../usage-guide/fault-tolerant-execution.yaml | 2 +- .../usage-guide/fault-tolerant-execution.adoc | 114 +++--------------- .../src/crd/fault_tolerant_execution.rs | 87 +++++++++++-- .../02-install-trino.yaml.j2 | 2 +- 5 files changed, 109 insertions(+), 114 deletions(-) diff --git a/deploy/helm/trino-operator/crds/crds.yaml b/deploy/helm/trino-operator/crds/crds.yaml index 8e82b0d7..052e5917 100644 --- a/deploy/helm/trino-operator/crds/crds.yaml +++ b/deploy/helm/trino-operator/crds/crds.yaml @@ -118,7 +118,7 @@ spec: nullable: true type: boolean exchangeManager: - description: Exchange manager configuration for spooling intermediate data during fault tolerant execution. Required when using `TASK` retry policy, optional for `QUERY` retry policy. + description: Exchange manager configuration for spooling intermediate data during fault tolerant execution. Required when using `Task` retry policy, optional for `Query` retry policy. nullable: true oneOf: - required: @@ -128,6 +128,12 @@ spec: - required: - local properties: + configOverrides: + additionalProperties: + type: string + default: {} + description: The `configOverrides` allow overriding arbitrary exchange manager properties. + type: object hdfs: description: HDFS-based exchange manager. 
properties: @@ -344,7 +350,7 @@ spec: type: integer type: object queryRetryAttempts: - description: Maximum number of times Trino may attempt to retry a query before declaring it failed. Only applies to `QUERY` retry policy. + description: Maximum number of times Trino may attempt to retry a query before declaring it failed. Only applies to `Query` retry policy. format: uint32 minimum: 0.0 nullable: true @@ -363,13 +369,13 @@ spec: nullable: true type: string retryPolicy: - description: The retry policy for fault tolerant execution. `QUERY` retries entire queries, `TASK` retries individual tasks. When set to `TASK`, an exchange manager must be configured. + description: The retry policy for fault tolerant execution. `Query` retries entire queries, `Task` retries individual tasks. When set to `Task`, an exchange manager must be configured. enum: - - query - - task + - Query + - Task type: string taskRetryAttemptsPerTask: - description: Maximum number of times Trino may attempt to retry a single task before declaring the query failed. Only applies to `TASK` retry policy. + description: Maximum number of times Trino may attempt to retry a single task before declaring the query failed. Only applies to `Task` retry policy. 
format: uint32 minimum: 0.0 nullable: true diff --git a/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml index b2a768b4..d476428c 100644 --- a/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml +++ b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml @@ -11,7 +11,7 @@ spec: matchLabels: trino: trino-fault-tolerant faultTolerantExecution: - retryPolicy: TASK + retryPolicy: Task taskRetryAttemptsPerTask: 4 retryInitialDelay: 10s retryMaxDelay: 60s diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index eb95927d..ecb2d2e3 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -23,20 +23,20 @@ To enable the feature, you need to configure it in your `TrinoCluster` resource spec: clusterConfig: faultTolerantExecution: - retryPolicy: QUERY # <1> + retryPolicy: Query # <1> queryRetryAttempts: 3 # <2> ---- -<1> The retry policy - either `QUERY` or `TASK` -<2> Maximum number of times to retry a query (QUERY policy only) +<1> The retry policy - either `Query` or `Task` +<2> Maximum number of times to retry a query (Query policy only) == Retry policies The `retryPolicy` configuration property designates whether Trino retries entire queries or a query's individual tasks in the event of failure. -=== QUERY retry policy +=== Query retry policy -A `QUERY` retry policy instructs Trino to automatically retry a query in the event of an error occurring on a worker node. -A `QUERY` retry policy is recommended when the majority of the Trino cluster's workload consists of many small queries. +A `Query` retry policy instructs Trino to automatically retry a query in the event of an error occurring on a worker node. 
+A `Query` retry policy is recommended when the majority of the Trino cluster's workload consists of many small queries. By default, Trino does not implement fault tolerance for queries whose result set exceeds 32MB in size. This limit can be increased by modifying the `exchangeDeduplicationBufferSize` configuration property to be greater than the default value of `32MB`, but this results in higher memory usage on the coordinator. @@ -47,20 +47,20 @@ This limit can be increased by modifying the `exchangeDeduplicationBufferSize` c spec: clusterConfig: faultTolerantExecution: - retryPolicy: QUERY + retryPolicy: Query queryRetryAttempts: 3 exchangeDeduplicationBufferSize: 64MB # Increased from default 32MB ... ---- -=== TASK retry policy +=== Task retry policy -A `TASK` retry policy instructs Trino to retry individual query tasks in the event of failure. +A `Task` retry policy instructs Trino to retry individual query tasks in the event of failure. You **must** configure an exchange manager to use the task retry policy. This policy is recommended when executing large batch queries, as the cluster can more efficiently retry smaller tasks within the query rather than retry the whole query. -IMPORTANT: A `TASK` retry policy is best suited for long-running queries, but this policy can result in higher latency for short-running queries executed in high volume. -As a best practice, it is recommended to run a dedicated cluster with a `TASK` retry policy for large batch queries, separate from another cluster that handles short queries. +IMPORTANT: A `Task` retry policy is best suited for long-running queries, but this policy can result in higher latency for short-running queries executed in high volume. +As a best practice, it is recommended to run a dedicated cluster with a `Task` retry policy for large batch queries, separate from another cluster that handles short queries. 
There are tools that can help you achieve this by automatically routing queries based on certain criteria (such as query estimates or user) to different Trino clusters. Notable mentions are link:https://github.com/stackabletech/trino-lb[trino-lb {external-link-icon}^] and link:https://github.com/trinodb/trino-gateway[trino-gateway {external-link-icon}^]. [source,yaml] @@ -68,7 +68,7 @@ There are tools that can help you achieve this by automatically routing queries spec: clusterConfig: faultTolerantExecution: - retryPolicy: TASK + retryPolicy: Task taskRetryAttemptsPerTask: 4 exchangeManager: s3: @@ -82,20 +82,20 @@ spec: == Exchange manager Exchange spooling is responsible for storing and managing spooled data for fault-tolerant execution. -You can configure a filesystem-based exchange manager that stores spooled data in a specified location, such as AWS S3 and S3-compatible systems, Azure Blob Storage or HDFS. +You can configure a filesystem-based exchange manager that stores spooled data in a specified location, such as AWS S3 and S3-compatible systems, HDFS, or local filesystem. -NOTE: An exchange manager is required when using the `TASK` retry policy and optional for the `QUERY` retry policy. +NOTE: An exchange manager is required when using the `Task` retry policy and optional for the `Query` retry policy. === S3-compatible storage -You can use S3-compatible storage systems for exchange spooling, including AWS S3, MinIO, and Google Cloud Storage. +You can use S3-compatible storage systems for exchange spooling, including AWS S3 and MinIO. 
[source,yaml] ---- spec: clusterConfig: faultTolerantExecution: - retryPolicy: TASK + retryPolicy: Task exchangeManager: s3: baseDirectories: # <1> @@ -122,81 +122,7 @@ spec: <1> Multiple S3 buckets can be specified to distribute I/O load <2> S3 connection defined as a reference to an xref:concepts:s3.adoc[S3Connection] resource -For Google Cloud Storage, you can use GCS buckets with S3 compatibility: - -[source,yaml] ----- -spec: - clusterConfig: - faultTolerantExecution: - exchangeManager: - s3: - baseDirectories: - - "gs://my-gcs-bucket/trino-spooling" - connection: - inline: - host: storage.googleapis.com - port: 443 - accessStyle: Path - credentials: - secretClass: gcs-hmac-credentials - tls: - verification: - server: - caCert: - webPki: {} - gcsServiceAccountKey: - secretClass: "gcs-service-account-secret-class" - key: "service-account.json" ----- - -=== Azure Blob Storage - -You can configure Azure Blob Storage as the exchange spooling destination: - -[source,yaml] ----- -spec: - clusterConfig: - faultTolerantExecution: - retryPolicy: TASK - exchangeManager: - azure: - baseDirectories: - - "abfs://exchange-container@mystorageaccount.dfs.core.windows.net/exchange-spooling" - secretClass: azure-credentials # <1> - key: connectionString # <2> ----- -<1> SecretClass providing the Azure connection string -<2> Key name in the Secret that contains the connection string (defaults to `connectionString`) - -The Azure connection string should be provided via a SecretClass that refers to a Kubernetes Secret containing the Azure storage account connection string, like this: - -[source,yaml] ----- -apiVersion: secrets.stackable.tech/v1alpha1 -kind: SecretClass -metadata: - name: azure-credentials -spec: - backend: - k8sSearch: - searchNamespace: - pod: {} ----- - -[source,yaml] ----- -apiVersion: v1 -kind: Secret -metadata: - name: azure-secret - labels: - secrets.stackable.tech/class: azure-credentials -type: Opaque -stringData: - connectionString: 
"DefaultEndpointsProtocol=https;AccountName=mystorageaccount;AccountKey=your_account_key;EndpointSuffix=core.windows.net" ----- +For storage systems like Google Cloud Storage or Azure Blob Storage, you can use the S3-compatible configuration with `configOverrides` to provide the necessary exchange manager properties. === HDFS storage @@ -207,7 +133,7 @@ You can configure HDFS as the exchange spooling destination: spec: clusterConfig: faultTolerantExecution: - retryPolicy: TASK + retryPolicy: Task exchangeManager: hdfs: baseDirectories: @@ -229,7 +155,7 @@ A local directory can only be used for exchange in a distributed cluster if the spec: clusterConfig: faultTolerantExecution: - retryPolicy: TASK + retryPolicy: Task exchangeManager: local: baseDirectories: @@ -286,7 +212,7 @@ When using connectors that do not explicitly support fault-tolerant execution, y == Example -Here's an example of a Trino cluster with fault-tolerant execution enabled using the `TASK` retry policy and MinIO backed S3 as the exchange manager: +Here's an example of a Trino cluster with fault-tolerant execution enabled using the `Task` retry policy and MinIO backed S3 as the exchange manager: [source,bash] ---- diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs b/rust/operator-binary/src/crd/fault_tolerant_execution.rs index aaa192bd..e0050583 100644 --- a/rust/operator-binary/src/crd/fault_tolerant_execution.rs +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -5,7 +5,7 @@ //! //! Based on the Trino documentation: -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use serde::{Deserialize, Serialize}; use snafu::Snafu; @@ -29,22 +29,22 @@ use crate::{ #[serde(rename_all = "camelCase")] pub struct FaultTolerantExecutionConfig { /// The retry policy for fault tolerant execution. - /// `QUERY` retries entire queries, `TASK` retries individual tasks. - /// When set to `TASK`, an exchange manager must be configured. 
+ /// `Query` retries entire queries, `Task` retries individual tasks. + /// When set to `Task`, an exchange manager must be configured. pub retry_policy: RetryPolicy, /// Exchange manager configuration for spooling intermediate data during fault tolerant execution. - /// Required when using `TASK` retry policy, optional for `QUERY` retry policy. + /// Required when using `Task` retry policy, optional for `Query` retry policy. #[serde(skip_serializing_if = "Option::is_none")] pub exchange_manager: Option, /// Maximum number of times Trino may attempt to retry a query before declaring it failed. - /// Only applies to `QUERY` retry policy. + /// Only applies to `Query` retry policy. #[serde(skip_serializing_if = "Option::is_none")] pub query_retry_attempts: Option, /// Maximum number of times Trino may attempt to retry a single task before declaring the query failed. - /// Only applies to `TASK` retry policy. + /// Only applies to `Task` retry policy. #[serde(skip_serializing_if = "Option::is_none")] pub task_retry_attempts_per_task: Option, @@ -70,7 +70,7 @@ pub struct FaultTolerantExecutionConfig { } #[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] -#[serde(rename_all = "camelCase")] +#[serde(rename_all = "PascalCase")] pub enum RetryPolicy { /// Retry entire queries on failure Query, @@ -88,6 +88,10 @@ pub struct ExchangeManagerConfig { /// Backend-specific configuration. #[serde(flatten)] pub backend: ExchangeManagerBackend, + + /// The `configOverrides` allow overriding arbitrary exchange manager properties. 
+ #[serde(default)] + pub config_overrides: HashMap, } #[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] @@ -170,7 +174,7 @@ pub struct LocalExchangeConfig { #[derive(Snafu, Debug)] pub enum Error { - #[snafu(display("Exchange manager is required when using TASK retry policy"))] + #[snafu(display("Exchange manager is required when using Task retry policy"))] ExchangeManagerRequiredForTaskPolicy, #[snafu(display("Failed to resolve S3 connection"))] @@ -224,8 +228,8 @@ impl ResolvedFaultTolerantExecutionConfig { let mut config_properties = BTreeMap::new(); let retry_policy = match config.retry_policy { - RetryPolicy::Query => "QUERY", - RetryPolicy::Task => "TASK", + RetryPolicy::Query => "Query", + RetryPolicy::Task => "Task", }; config_properties.insert("retry-policy".to_string(), retry_policy.to_string()); @@ -358,6 +362,8 @@ impl ResolvedFaultTolerantExecutionConfig { ); } } + + exchange_manager_properties.extend(exchange_config.config_overrides.clone()); } let mut resolved_config = Self { @@ -512,7 +518,7 @@ mod tests { assert_eq!( fte_config.config_properties.get("retry-policy"), - Some(&"QUERY".to_string()) + Some(&"Query".to_string()) ); assert_eq!( fte_config.config_properties.get("query-retry-attempts"), @@ -588,6 +594,7 @@ mod tests { max_error_retries: Some(5), upload_part_size: Some("10MB".to_string()), }), + config_overrides: std::collections::HashMap::new(), }), query_retry_attempts: None, task_retry_attempts_per_task: Some(2), @@ -605,7 +612,7 @@ mod tests { assert_eq!( fte_config.config_properties.get("retry-policy"), - Some(&"TASK".to_string()) + Some(&"Task".to_string()) ); assert_eq!( fte_config @@ -676,4 +683,60 @@ mod tests { Some(&"8".to_string()) ); } + + #[tokio::test] + async fn test_exchange_manager_config_overrides() { + let mut config_overrides = HashMap::new(); + config_overrides.insert("custom.property".to_string(), "custom-value".to_string()); + 
config_overrides.insert("exchange.s3.upload.part-size".to_string(), "overridden-value".to_string()); + + let config = FaultTolerantExecutionConfig { + retry_policy: RetryPolicy::Task, + exchange_manager: Some(ExchangeManagerConfig { + general: ExchangeManagerGeneralConfig { + sink_buffer_pool_min_size: None, + sink_buffers_per_partition: None, + sink_max_file_size: None, + source_concurrent_readers: None, + }, + backend: ExchangeManagerBackend::S3(S3ExchangeConfig { + base_directories: vec!["s3://my-bucket/exchange".to_string()], + connection: stackable_operator::crd::s3::v1alpha1::InlineConnectionOrReference::Reference( + "test-s3-connection".to_string() + ), + iam_role: None, + external_id: None, + max_error_retries: None, + upload_part_size: Some("original-value".to_string()), + }), + config_overrides, + }), + query_retry_attempts: None, + task_retry_attempts_per_task: Some(2), + retry_initial_delay: None, + retry_max_delay: None, + retry_delay_scale_factor: None, + exchange_deduplication_buffer_size: None, + exchange_encryption_enabled: None, + }; + + let fte_config = + ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default") + .await + .unwrap(); + + assert_eq!( + fte_config + .exchange_manager_properties + .get("custom.property"), + Some(&"custom-value".to_string()) + ); + + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.s3.upload.part-size"), + Some(&"overridden-value".to_string()) + ); + } } diff --git a/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 b/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 index d08fad01..955b3ea2 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 +++ b/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 @@ -18,7 +18,7 @@ spec: trino: trino-fte # Fault tolerant execution with S3/MinIO exchange manager faultTolerantExecution: - retryPolicy: TASK + retryPolicy: Task exchangeManager: s3: 
baseDirectories: From 499665c432c6e47f5e08e5a7a633e64e007e09a2 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Wed, 6 Aug 2025 16:44:02 +0200 Subject: [PATCH 11/26] fix: always convert durations to seconds --- .../src/crd/fault_tolerant_execution.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs b/rust/operator-binary/src/crd/fault_tolerant_execution.rs index e0050583..e6d97dc8 100644 --- a/rust/operator-binary/src/crd/fault_tolerant_execution.rs +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -246,12 +246,18 @@ impl ResolvedFaultTolerantExecutionConfig { Self::insert_if_present( &mut config_properties, "retry-initial-delay", - config.retry_initial_delay.as_ref(), + config + .retry_initial_delay + .as_ref() + .map(|d| format!("{}s", d.as_secs())), ); Self::insert_if_present( &mut config_properties, "retry-max-delay", - config.retry_max_delay.as_ref(), + config + .retry_max_delay + .as_ref() + .map(|d| format!("{}s", d.as_secs())), ); Self::insert_if_present( &mut config_properties, @@ -530,7 +536,7 @@ mod tests { ); assert_eq!( fte_config.config_properties.get("retry-max-delay"), - Some(&"1m30s".to_string()) + Some(&"90s".to_string()) ); assert_eq!( fte_config.config_properties.get("retry-delay-scale-factor"), @@ -688,7 +694,10 @@ mod tests { async fn test_exchange_manager_config_overrides() { let mut config_overrides = HashMap::new(); config_overrides.insert("custom.property".to_string(), "custom-value".to_string()); - config_overrides.insert("exchange.s3.upload.part-size".to_string(), "overridden-value".to_string()); + config_overrides.insert( + "exchange.s3.upload.part-size".to_string(), + "overridden-value".to_string(), + ); let config = FaultTolerantExecutionConfig { retry_policy: RetryPolicy::Task, From 9986801c3236673cfc3e334bc283794adf9d940c Mon Sep 17 00:00:00 2001 From: dervoeti Date: Thu, 7 Aug 2025 15:48:23 +0200 Subject: [PATCH 12/26] feat!: 
restructured CRD --- deploy/helm/trino-operator/crds/crds.yaml | 701 ++++++++++++------ .../usage-guide/fault-tolerant-execution.yaml | 38 +- .../usage-guide/fault-tolerant-execution.adoc | 95 +-- .../src/crd/fault_tolerant_execution.rs | 411 +++++----- .../01-install-minio.yaml | 2 +- .../02-install-trino.yaml.j2 | 14 +- .../04-copy-scripts.yaml | 2 +- .../fault-tolerant-execution/04_check-fte.py | 2 + .../05-run-tests.yaml | 5 +- 9 files changed, 794 insertions(+), 476 deletions(-) diff --git a/deploy/helm/trino-operator/crds/crds.yaml b/deploy/helm/trino-operator/crds/crds.yaml index 052e5917..19ee4127 100644 --- a/deploy/helm/trino-operator/crds/crds.yaml +++ b/deploy/helm/trino-operator/crds/crds.yaml @@ -108,280 +108,539 @@ spec: faultTolerantExecution: description: Fault tolerant execution configuration. When enabled, Trino can automatically retry queries or tasks in case of failures. nullable: true + oneOf: + - required: + - query + - required: + - task properties: - exchangeDeduplicationBufferSize: - description: Data size of the coordinator's in-memory buffer used to store output of query stages. - nullable: true - type: string - exchangeEncryptionEnabled: - description: Whether to enable encryption of spooling data. - nullable: true - type: boolean - exchangeManager: - description: Exchange manager configuration for spooling intermediate data during fault tolerant execution. Required when using `Task` retry policy, optional for `Query` retry policy. - nullable: true - oneOf: - - required: - - s3 - - required: - - hdfs - - required: - - local + query: + description: Query-level fault tolerant execution. Retries entire queries on failure. properties: - configOverrides: - additionalProperties: - type: string - default: {} - description: The `configOverrides` allow overriding arbitrary exchange manager properties. - type: object - hdfs: - description: HDFS-based exchange manager. 
+ exchangeDeduplicationBufferSize: + description: Data size of the coordinator's in-memory buffer used to store output of query stages. + nullable: true + type: string + exchangeManager: + description: Exchange manager configuration for spooling intermediate data during fault tolerant execution. Optional for Query retry policy, recommended for large result sets. + nullable: true + oneOf: + - required: + - s3 + - required: + - hdfs + - required: + - local properties: - baseDirectories: - description: HDFS URIs for spooling data. - items: + configOverrides: + additionalProperties: type: string - type: array - blockSize: - description: Block data size for HDFS storage. + default: {} + description: The `configOverrides` allow overriding arbitrary exchange manager properties. + type: object + encryptionEnabled: + description: Whether to enable encryption of spooling data. nullable: true - type: string + type: boolean hdfs: - description: HDFS connection configuration. + description: HDFS-based exchange manager. + properties: + baseDirectories: + description: HDFS URIs for spooling data. + items: + type: string + type: array + blockSize: + description: Block data size for HDFS storage. + nullable: true + type: string + hdfs: + description: HDFS connection configuration. + properties: + configMap: + description: Name of the [discovery ConfigMap](https://docs.stackable.tech/home/nightly/concepts/service_discovery) providing information about the HDFS cluster. + type: string + required: + - configMap + type: object + skipDirectorySchemeValidation: + description: Skip directory scheme validation to support Hadoop-compatible file systems. + nullable: true + type: boolean + required: + - baseDirectories + - hdfs + type: object + local: + description: Local filesystem storage (not recommended for production). + properties: + baseDirectories: + description: Local filesystem paths for exchange storage. 
+ items: + type: string + type: array + required: + - baseDirectories + type: object + s3: + description: S3-compatible storage configuration. properties: - configMap: - description: Name of the [discovery ConfigMap](https://docs.stackable.tech/home/nightly/concepts/service_discovery) providing information about the HDFS cluster. + baseDirectories: + description: S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). + items: + type: string + type: array + connection: + description: S3 connection configuration. Learn more about S3 configuration in the [S3 concept docs](https://docs.stackable.tech/home/nightly/concepts/s3). + oneOf: + - required: + - inline + - required: + - reference + properties: + inline: + description: S3 connection definition as a resource. Learn more on the [S3 concept documentation](https://docs.stackable.tech/home/nightly/concepts/s3). + properties: + accessStyle: + default: VirtualHosted + description: Which access style to use. Defaults to virtual hosted-style as most of the data products out there. Have a look at the [AWS documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html). + enum: + - Path + - VirtualHosted + type: string + credentials: + description: If the S3 uses authentication you have to specify you S3 credentials. In the most cases a [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) providing `accessKey` and `secretKey` is sufficient. + nullable: true + properties: + scope: + description: '[Scope](https://docs.stackable.tech/home/nightly/secret-operator/scope) of the [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass).' + nullable: true + properties: + listenerVolumes: + default: [] + description: The listener volume scope allows Node and Service scopes to be inferred from the applicable listeners. This must correspond to Volume names in the Pod that mount Listeners. 
+ items: + type: string + type: array + node: + default: false + description: The node scope is resolved to the name of the Kubernetes Node object that the Pod is running on. This will typically be the DNS name of the node. + type: boolean + pod: + default: false + description: The pod scope is resolved to the name of the Kubernetes Pod. This allows the secret to differentiate between StatefulSet replicas. + type: boolean + services: + default: [] + description: The service scope allows Pod objects to specify custom scopes. This should typically correspond to Service objects that the Pod participates in. + items: + type: string + type: array + type: object + secretClass: + description: '[SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) containing the LDAP bind credentials.' + type: string + required: + - secretClass + type: object + host: + description: 'Host of the S3 server without any protocol or port. For example: `west1.my-cloud.com`.' + type: string + port: + description: Port the S3 server listens on. If not specified the product will determine the port to use. + format: uint16 + minimum: 0.0 + nullable: true + type: integer + region: + default: + name: us-east-1 + description: |- + Bucket region used for signing headers (sigv4). + + This defaults to `us-east-1` which is compatible with other implementations such as Minio. + + WARNING: Some products use the Hadoop S3 implementation which falls back to us-east-2. + properties: + name: + default: us-east-1 + type: string + type: object + tls: + description: Use a TLS connection. If not specified no TLS will be used. + nullable: true + properties: + verification: + description: The verification method used to verify the certificates of the server and/or the client. + oneOf: + - required: + - none + - required: + - server + properties: + none: + description: Use TLS but don't verify certificates. 
+ type: object + server: + description: Use TLS and a CA certificate to verify the server. + properties: + caCert: + description: CA cert to verify the server. + oneOf: + - required: + - webPki + - required: + - secretClass + properties: + secretClass: + description: Name of the [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) which will provide the CA certificate. Note that a SecretClass does not need to have a key but can also work with just a CA certificate, so if you got provided with a CA cert but don't have access to the key you can still use this method. + type: string + webPki: + description: Use TLS and the CA certificates trusted by the common web browsers to verify the server. This can be useful when you e.g. use public AWS S3 or other public available services. + type: object + type: object + required: + - caCert + type: object + type: object + required: + - verification + type: object + required: + - host + type: object + reference: + type: string + type: object + externalId: + description: External ID for the IAM role trust policy. + nullable: true + type: string + iamRole: + description: IAM role to assume for S3 access. + nullable: true + type: string + maxErrorRetries: + description: Maximum number of times the S3 client should retry a request. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + uploadPartSize: + description: Part data size for S3 multi-part upload. + nullable: true type: string required: - - configMap + - baseDirectories + - connection type: object - skipDirectorySchemeValidation: - description: Skip directory scheme validation to support Hadoop-compatible file systems. + sinkBufferPoolMinSize: + description: The minimum buffer pool size for an exchange sink. The larger the buffer pool size, the larger the write parallelism and memory usage. 
+ format: uint32 + minimum: 0.0 nullable: true - type: boolean - required: - - baseDirectories - - hdfs - type: object - local: - description: Local filesystem storage (not recommended for production). - properties: - baseDirectories: - description: Local filesystem paths for exchange storage. - items: - type: string - type: array - required: - - baseDirectories + type: integer + sinkBuffersPerPartition: + description: The number of buffers per partition in the buffer pool. The larger the buffer pool size, the larger the write parallelism and memory usage. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + sinkMaxFileSize: + description: Max data size of files written by exchange sinks. + nullable: true + type: string + sourceConcurrentReaders: + description: Number of concurrent readers to read from spooling storage. The larger the number of concurrent readers, the larger the read parallelism and memory usage. + format: uint32 + minimum: 0.0 + nullable: true + type: integer type: object - s3: - description: S3-compatible storage configuration. + retryAttempts: + description: Maximum number of times Trino may attempt to retry a query before declaring it failed. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + retryDelayScaleFactor: + description: Factor by which retry delay is increased on each query failure. + format: float + nullable: true + type: number + retryInitialDelay: + description: Minimum time that a failed query must wait before it is retried. + nullable: true + type: string + retryMaxDelay: + description: Maximum time that a failed query must wait before it is retried. + nullable: true + type: string + type: object + task: + description: Task-level fault tolerant execution. Retries individual tasks on failure (requires exchange manager). + properties: + exchangeDeduplicationBufferSize: + description: Data size of the coordinator's in-memory buffer used to store output of query stages. 
+ nullable: true + type: string + exchangeManager: + description: Exchange manager configuration for spooling intermediate data during fault tolerant execution. Required for Task retry policy. + oneOf: + - required: + - s3 + - required: + - hdfs + - required: + - local properties: - baseDirectories: - description: S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). - items: + configOverrides: + additionalProperties: type: string - type: array - connection: - description: S3 connection configuration. Learn more about S3 configuration in the [S3 concept docs](https://docs.stackable.tech/home/nightly/concepts/s3). - oneOf: - - required: - - inline - - required: - - reference + default: {} + description: The `configOverrides` allow overriding arbitrary exchange manager properties. + type: object + encryptionEnabled: + description: Whether to enable encryption of spooling data. + nullable: true + type: boolean + hdfs: + description: HDFS-based exchange manager. properties: - inline: - description: S3 connection definition as a resource. Learn more on the [S3 concept documentation](https://docs.stackable.tech/home/nightly/concepts/s3). + baseDirectories: + description: HDFS URIs for spooling data. + items: + type: string + type: array + blockSize: + description: Block data size for HDFS storage. + nullable: true + type: string + hdfs: + description: HDFS connection configuration. properties: - accessStyle: - default: VirtualHosted - description: Which access style to use. Defaults to virtual hosted-style as most of the data products out there. Have a look at the [AWS documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html). - enum: - - Path - - VirtualHosted + configMap: + description: Name of the [discovery ConfigMap](https://docs.stackable.tech/home/nightly/concepts/service_discovery) providing information about the HDFS cluster. 
type: string - credentials: - description: If the S3 uses authentication you have to specify you S3 credentials. In the most cases a [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) providing `accessKey` and `secretKey` is sufficient. - nullable: true + required: + - configMap + type: object + skipDirectorySchemeValidation: + description: Skip directory scheme validation to support Hadoop-compatible file systems. + nullable: true + type: boolean + required: + - baseDirectories + - hdfs + type: object + local: + description: Local filesystem storage (not recommended for production). + properties: + baseDirectories: + description: Local filesystem paths for exchange storage. + items: + type: string + type: array + required: + - baseDirectories + type: object + s3: + description: S3-compatible storage configuration. + properties: + baseDirectories: + description: S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). + items: + type: string + type: array + connection: + description: S3 connection configuration. Learn more about S3 configuration in the [S3 concept docs](https://docs.stackable.tech/home/nightly/concepts/s3). + oneOf: + - required: + - inline + - required: + - reference + properties: + inline: + description: S3 connection definition as a resource. Learn more on the [S3 concept documentation](https://docs.stackable.tech/home/nightly/concepts/s3). properties: - scope: - description: '[Scope](https://docs.stackable.tech/home/nightly/secret-operator/scope) of the [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass).' + accessStyle: + default: VirtualHosted + description: Which access style to use. Defaults to virtual hosted-style as most of the data products out there. Have a look at the [AWS documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html). 
+ enum: + - Path + - VirtualHosted + type: string + credentials: + description: If the S3 uses authentication you have to specify you S3 credentials. In the most cases a [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) providing `accessKey` and `secretKey` is sufficient. nullable: true properties: - listenerVolumes: - default: [] - description: The listener volume scope allows Node and Service scopes to be inferred from the applicable listeners. This must correspond to Volume names in the Pod that mount Listeners. - items: - type: string - type: array - node: - default: false - description: The node scope is resolved to the name of the Kubernetes Node object that the Pod is running on. This will typically be the DNS name of the node. - type: boolean - pod: - default: false - description: The pod scope is resolved to the name of the Kubernetes Pod. This allows the secret to differentiate between StatefulSet replicas. - type: boolean - services: - default: [] - description: The service scope allows Pod objects to specify custom scopes. This should typically correspond to Service objects that the Pod participates in. - items: - type: string - type: array + scope: + description: '[Scope](https://docs.stackable.tech/home/nightly/secret-operator/scope) of the [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass).' + nullable: true + properties: + listenerVolumes: + default: [] + description: The listener volume scope allows Node and Service scopes to be inferred from the applicable listeners. This must correspond to Volume names in the Pod that mount Listeners. + items: + type: string + type: array + node: + default: false + description: The node scope is resolved to the name of the Kubernetes Node object that the Pod is running on. This will typically be the DNS name of the node. + type: boolean + pod: + default: false + description: The pod scope is resolved to the name of the Kubernetes Pod. 
This allows the secret to differentiate between StatefulSet replicas. + type: boolean + services: + default: [] + description: The service scope allows Pod objects to specify custom scopes. This should typically correspond to Service objects that the Pod participates in. + items: + type: string + type: array + type: object + secretClass: + description: '[SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) containing the LDAP bind credentials.' + type: string + required: + - secretClass type: object - secretClass: - description: '[SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) containing the LDAP bind credentials.' + host: + description: 'Host of the S3 server without any protocol or port. For example: `west1.my-cloud.com`.' type: string - required: - - secretClass - type: object - host: - description: 'Host of the S3 server without any protocol or port. For example: `west1.my-cloud.com`.' - type: string - port: - description: Port the S3 server listens on. If not specified the product will determine the port to use. - format: uint16 - minimum: 0.0 - nullable: true - type: integer - region: - default: - name: us-east-1 - description: |- - Bucket region used for signing headers (sigv4). + port: + description: Port the S3 server listens on. If not specified the product will determine the port to use. + format: uint16 + minimum: 0.0 + nullable: true + type: integer + region: + default: + name: us-east-1 + description: |- + Bucket region used for signing headers (sigv4). - This defaults to `us-east-1` which is compatible with other implementations such as Minio. + This defaults to `us-east-1` which is compatible with other implementations such as Minio. - WARNING: Some products use the Hadoop S3 implementation which falls back to us-east-2. - properties: - name: - default: us-east-1 - type: string - type: object - tls: - description: Use a TLS connection. If not specified no TLS will be used. 
- nullable: true - properties: - verification: - description: The verification method used to verify the certificates of the server and/or the client. - oneOf: - - required: - - none - - required: - - server + WARNING: Some products use the Hadoop S3 implementation which falls back to us-east-2. properties: - none: - description: Use TLS but don't verify certificates. - type: object - server: - description: Use TLS and a CA certificate to verify the server. + name: + default: us-east-1 + type: string + type: object + tls: + description: Use a TLS connection. If not specified no TLS will be used. + nullable: true + properties: + verification: + description: The verification method used to verify the certificates of the server and/or the client. + oneOf: + - required: + - none + - required: + - server properties: - caCert: - description: CA cert to verify the server. - oneOf: - - required: - - webPki - - required: - - secretClass + none: + description: Use TLS but don't verify certificates. + type: object + server: + description: Use TLS and a CA certificate to verify the server. properties: - secretClass: - description: Name of the [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) which will provide the CA certificate. Note that a SecretClass does not need to have a key but can also work with just a CA certificate, so if you got provided with a CA cert but don't have access to the key you can still use this method. - type: string - webPki: - description: Use TLS and the CA certificates trusted by the common web browsers to verify the server. This can be useful when you e.g. use public AWS S3 or other public available services. + caCert: + description: CA cert to verify the server. + oneOf: + - required: + - webPki + - required: + - secretClass + properties: + secretClass: + description: Name of the [SecretClass](https://docs.stackable.tech/home/nightly/secret-operator/secretclass) which will provide the CA certificate. 
Note that a SecretClass does not need to have a key but can also work with just a CA certificate, so if you got provided with a CA cert but don't have access to the key you can still use this method. + type: string + webPki: + description: Use TLS and the CA certificates trusted by the common web browsers to verify the server. This can be useful when you e.g. use public AWS S3 or other public available services. + type: object type: object + required: + - caCert type: object - required: - - caCert type: object + required: + - verification type: object required: - - verification + - host type: object - required: - - host + reference: + type: string type: object - reference: + externalId: + description: External ID for the IAM role trust policy. + nullable: true + type: string + iamRole: + description: IAM role to assume for S3 access. + nullable: true + type: string + maxErrorRetries: + description: Maximum number of times the S3 client should retry a request. + format: uint32 + minimum: 0.0 + nullable: true + type: integer + uploadPartSize: + description: Part data size for S3 multi-part upload. + nullable: true type: string + required: + - baseDirectories + - connection type: object - externalId: - description: External ID for the IAM role trust policy. - nullable: true - type: string - iamRole: - description: IAM role to assume for S3 access. + sinkBufferPoolMinSize: + description: The minimum buffer pool size for an exchange sink. The larger the buffer pool size, the larger the write parallelism and memory usage. + format: uint32 + minimum: 0.0 nullable: true - type: string - maxErrorRetries: - description: Maximum number of times the S3 client should retry a request. + type: integer + sinkBuffersPerPartition: + description: The number of buffers per partition in the buffer pool. The larger the buffer pool size, the larger the write parallelism and memory usage. 
format: uint32 minimum: 0.0 nullable: true type: integer - uploadPartSize: - description: Part data size for S3 multi-part upload. + sinkMaxFileSize: + description: Max data size of files written by exchange sinks. nullable: true type: string - required: - - baseDirectories - - connection + sourceConcurrentReaders: + description: Number of concurrent readers to read from spooling storage. The larger the number of concurrent readers, the larger the read parallelism and memory usage. + format: uint32 + minimum: 0.0 + nullable: true + type: integer type: object - sinkBufferPoolMinSize: - description: The minimum buffer pool size for an exchange sink. The larger the buffer pool size, the larger the write parallelism and memory usage. + retryAttemptsPerTask: + description: Maximum number of times Trino may attempt to retry a single task before declaring the query failed. format: uint32 minimum: 0.0 nullable: true type: integer - sinkBuffersPerPartition: - description: The number of buffers per partition in the buffer pool. The larger the buffer pool size, the larger the write parallelism and memory usage. - format: uint32 - minimum: 0.0 + retryDelayScaleFactor: + description: Factor by which retry delay is increased on each task failure. + format: float nullable: true - type: integer - sinkMaxFileSize: - description: Max data size of files written by exchange sinks. + type: number + retryInitialDelay: + description: Minimum time that a failed task must wait before it is retried. nullable: true type: string - sourceConcurrentReaders: - description: Number of concurrent readers to read from spooling storage. The larger the number of concurrent readers, the larger the read parallelism and memory usage. - format: uint32 - minimum: 0.0 + retryMaxDelay: + description: Maximum time that a failed task must wait before it is retried. 
nullable: true - type: integer + type: string + required: + - exchangeManager type: object - queryRetryAttempts: - description: Maximum number of times Trino may attempt to retry a query before declaring it failed. Only applies to `Query` retry policy. - format: uint32 - minimum: 0.0 - nullable: true - type: integer - retryDelayScaleFactor: - description: Factor by which retry delay is increased on each query or task failure. - format: float - nullable: true - type: number - retryInitialDelay: - description: Minimum time that a failed query or task must wait before it is retried. - nullable: true - type: string - retryMaxDelay: - description: Maximum time that a failed query or task must wait before it is retried. - nullable: true - type: string - retryPolicy: - description: The retry policy for fault tolerant execution. `Query` retries entire queries, `Task` retries individual tasks. When set to `Task`, an exchange manager must be configured. - enum: - - Query - - Task - type: string - taskRetryAttemptsPerTask: - description: Maximum number of times Trino may attempt to retry a single task before declaring the query failed. Only applies to `Task` retry policy. 
- format: uint32 - minimum: 0.0 - nullable: true - type: integer - required: - - retryPolicy type: object tls: default: diff --git a/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml index d476428c..89125c0a 100644 --- a/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml +++ b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml @@ -11,25 +11,25 @@ spec: matchLabels: trino: trino-fault-tolerant faultTolerantExecution: - retryPolicy: Task - taskRetryAttemptsPerTask: 4 - retryInitialDelay: 10s - retryMaxDelay: 60s - retryDelayScaleFactor: 2.0 - exchangeDeduplicationBufferSize: 64MB - exchangeEncryptionEnabled: true - exchangeManager: - sinkBufferPoolMinSize: 20 - sinkBuffersPerPartition: 4 - sinkMaxFileSize: 2GB - sourceConcurrentReaders: 8 - s3: - baseDirectories: - - "s3://trino-exchange-bucket/spooling" - connection: - reference: minio-connection - maxErrorRetries: 10 - uploadPartSize: 10MB + task: + retryAttemptsPerTask: 4 + retryInitialDelay: 10s + retryMaxDelay: 60s + retryDelayScaleFactor: 2.0 + exchangeDeduplicationBufferSize: 64MB + exchangeManager: + encryptionEnabled: true + sinkBufferPoolMinSize: 20 + sinkBuffersPerPartition: 4 + sinkMaxFileSize: 2GB + sourceConcurrentReaders: 8 + s3: + baseDirectories: + - "s3://trino-exchange-bucket/spooling" + connection: + reference: minio-connection + maxErrorRetries: 10 + uploadPartSize: 10MB coordinators: roleGroups: default: diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index ecb2d2e3..e4722188 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -16,51 +16,35 @@ Take a look at the link:https://trino.io/docs/current/admin/fault-tolerant-execu == Configuration Fault-tolerant execution is 
not enabled by default. -To enable the feature, you need to configure it in your `TrinoCluster` resource by adding a `faultTolerantExecution` section to the cluster configuration: - -[source,yaml] ----- -spec: - clusterConfig: - faultTolerantExecution: - retryPolicy: Query # <1> - queryRetryAttempts: 3 # <2> ----- -<1> The retry policy - either `Query` or `Task` -<2> Maximum number of times to retry a query (Query policy only) - -== Retry policies - -The `retryPolicy` configuration property designates whether Trino retries entire queries or a query's individual tasks in the event of failure. +To enable the feature, you need to configure it in your `TrinoCluster` resource by adding a `faultTolerantExecution` section to the cluster configuration. +The configuration uses a structured approach where you choose either `query` or `task` retry policy, each with their specific configuration options. === Query retry policy -A `Query` retry policy instructs Trino to automatically retry a query in the event of an error occurring on a worker node. -A `Query` retry policy is recommended when the majority of the Trino cluster's workload consists of many small queries. +A `query` retry policy instructs Trino to automatically retry a query in the event of an error occurring on a worker node. +This policy is recommended when the majority of the Trino cluster's workload consists of many small queries. By default, Trino does not implement fault tolerance for queries whose result set exceeds 32MB in size. This limit can be increased by modifying the `exchangeDeduplicationBufferSize` configuration property to be greater than the default value of `32MB`, but this results in higher memory usage on the coordinator. [source,yaml] ---- -... spec: clusterConfig: faultTolerantExecution: - retryPolicy: Query - queryRetryAttempts: 3 - exchangeDeduplicationBufferSize: 64MB # Increased from default 32MB -... 
+ query: + retryAttempts: 3 + exchangeDeduplicationBufferSize: 64MB # Increased from default 32MB ---- === Task retry policy -A `Task` retry policy instructs Trino to retry individual query tasks in the event of failure. +A `task` retry policy instructs Trino to retry individual query tasks in the event of failure. You **must** configure an exchange manager to use the task retry policy. This policy is recommended when executing large batch queries, as the cluster can more efficiently retry smaller tasks within the query rather than retry the whole query. -IMPORTANT: A `Task` retry policy is best suited for long-running queries, but this policy can result in higher latency for short-running queries executed in high volume. -As a best practice, it is recommended to run a dedicated cluster with a `Task` retry policy for large batch queries, separate from another cluster that handles short queries. +IMPORTANT: A `task` retry policy is best suited for long-running queries, but this policy can result in higher latency for short-running queries executed in high volume. +As a best practice, it is recommended to run a dedicated cluster with a `task` retry policy for large batch queries, separate from another cluster that handles short queries. There are tools that can help you achieve this by automatically routing queries based on certain criteria (such as query estimates or user) to different Trino clusters. Notable mentions are link:https://github.com/stackabletech/trino-lb[trino-lb {external-link-icon}^] and link:https://github.com/trinodb/trino-gateway[trino-gateway {external-link-icon}^]. 
[source,yaml] @@ -68,14 +52,15 @@ There are tools that can help you achieve this by automatically routing queries spec: clusterConfig: faultTolerantExecution: - retryPolicy: Task - taskRetryAttemptsPerTask: 4 - exchangeManager: - s3: - baseDirectories: - - "s3://trino-exchange-bucket/spooling" - connection: - reference: my-s3-connection # <1> + task: + retryAttemptsPerTask: 4 + exchangeManager: # Mandatory for Task retry policy + encryptionEnabled: true + s3: + baseDirectories: + - "s3://trino-exchange-bucket/spooling" + connection: + reference: my-s3-connection # <1> ---- <1> Reference to an xref:concepts:s3.adoc[S3Connection] resource @@ -84,7 +69,7 @@ spec: Exchange spooling is responsible for storing and managing spooled data for fault-tolerant execution. You can configure a filesystem-based exchange manager that stores spooled data in a specified location, such as AWS S3 and S3-compatible systems, HDFS, or local filesystem. -NOTE: An exchange manager is required when using the `Task` retry policy and optional for the `Query` retry policy. +NOTE: An exchange manager is required when using the `task` retry policy and optional for the `query` retry policy. 
=== S3-compatible storage @@ -95,13 +80,14 @@ You can use S3-compatible storage systems for exchange spooling, including AWS S spec: clusterConfig: faultTolerantExecution: - retryPolicy: Task - exchangeManager: - s3: - baseDirectories: # <1> - - "s3://exchange-bucket-1/trino-spooling" - connection: - reference: minio-s3-connection # <2> + task: + retryAttemptsPerTask: 4 + exchangeManager: + s3: + baseDirectories: # <1> + - "s3://exchange-bucket-1/trino-spooling" + connection: + reference: minio-s3-connection # <2> --- apiVersion: s3.stackable.tech/v1alpha1 kind: S3Connection @@ -133,13 +119,14 @@ You can configure HDFS as the exchange spooling destination: spec: clusterConfig: faultTolerantExecution: - retryPolicy: Task - exchangeManager: - hdfs: - baseDirectories: - - "hdfs://simple-hdfs/exchange-spooling" + task: + retryAttemptsPerTask: 4 + exchangeManager: hdfs: - configMap: simple-hdfs # <1> + baseDirectories: + - "hdfs://simple-hdfs/exchange-spooling" + hdfs: + configMap: simple-hdfs # <1> ---- <1> ConfigMap containing HDFS configuration files (created by the HDFS operator) @@ -155,11 +142,11 @@ A local directory can only be used for exchange in a distributed cluster if the spec: clusterConfig: faultTolerantExecution: - retryPolicy: Task - exchangeManager: - local: - baseDirectories: - - "/trino-exchange" + task: + exchangeManager: + local: + baseDirectories: + - "/trino-exchange" coordinators: roleGroups: default: @@ -212,7 +199,7 @@ When using connectors that do not explicitly support fault-tolerant execution, y == Example -Here's an example of a Trino cluster with fault-tolerant execution enabled using the `Task` retry policy and MinIO backed S3 as the exchange manager: +Here's an example of a Trino cluster with fault-tolerant execution enabled using the `task` retry policy and MinIO backed S3 as the exchange manager: [source,bash] ---- diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs 
b/rust/operator-binary/src/crd/fault_tolerant_execution.rs index e6d97dc8..7e40e7f7 100644 --- a/rust/operator-binary/src/crd/fault_tolerant_execution.rs +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -27,36 +27,29 @@ use crate::{ #[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] -pub struct FaultTolerantExecutionConfig { - /// The retry policy for fault tolerant execution. - /// `Query` retries entire queries, `Task` retries individual tasks. - /// When set to `Task`, an exchange manager must be configured. - pub retry_policy: RetryPolicy, - - /// Exchange manager configuration for spooling intermediate data during fault tolerant execution. - /// Required when using `Task` retry policy, optional for `Query` retry policy. - #[serde(skip_serializing_if = "Option::is_none")] - pub exchange_manager: Option, +pub enum FaultTolerantExecutionConfig { + /// Query-level fault tolerant execution. Retries entire queries on failure. + Query(QueryRetryConfig), + /// Task-level fault tolerant execution. Retries individual tasks on failure (requires exchange manager). + Task(TaskRetryConfig), +} +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct QueryRetryConfig { /// Maximum number of times Trino may attempt to retry a query before declaring it failed. - /// Only applies to `Query` retry policy. - #[serde(skip_serializing_if = "Option::is_none")] - pub query_retry_attempts: Option, - - /// Maximum number of times Trino may attempt to retry a single task before declaring the query failed. - /// Only applies to `Task` retry policy. #[serde(skip_serializing_if = "Option::is_none")] - pub task_retry_attempts_per_task: Option, + pub retry_attempts: Option, - /// Minimum time that a failed query or task must wait before it is retried. + /// Minimum time that a failed query must wait before it is retried. 
#[serde(skip_serializing_if = "Option::is_none")] pub retry_initial_delay: Option, - /// Maximum time that a failed query or task must wait before it is retried. + /// Maximum time that a failed query must wait before it is retried. #[serde(skip_serializing_if = "Option::is_none")] pub retry_max_delay: Option, - /// Factor by which retry delay is increased on each query or task failure. + /// Factor by which retry delay is increased on each query failure. #[serde(skip_serializing_if = "Option::is_none")] pub retry_delay_scale_factor: Option, @@ -64,39 +57,47 @@ pub struct FaultTolerantExecutionConfig { #[serde(skip_serializing_if = "Option::is_none")] pub exchange_deduplication_buffer_size: Option, - /// Whether to enable encryption of spooling data. + /// Exchange manager configuration for spooling intermediate data during fault tolerant execution. + /// Optional for Query retry policy, recommended for large result sets. #[serde(skip_serializing_if = "Option::is_none")] - pub exchange_encryption_enabled: Option, -} - -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] -#[serde(rename_all = "PascalCase")] -pub enum RetryPolicy { - /// Retry entire queries on failure - Query, - /// Retry individual tasks on failure (requires exchange manager) - Task, + pub exchange_manager: Option, } -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] -pub struct ExchangeManagerConfig { - /// General exchange manager configuration that applies to all backends. - #[serde(flatten)] - pub general: ExchangeManagerGeneralConfig, +pub struct TaskRetryConfig { + /// Maximum number of times Trino may attempt to retry a single task before declaring the query failed. + #[serde(skip_serializing_if = "Option::is_none")] + pub retry_attempts_per_task: Option, - /// Backend-specific configuration. 
- #[serde(flatten)] - pub backend: ExchangeManagerBackend, + /// Minimum time that a failed task must wait before it is retried. + #[serde(skip_serializing_if = "Option::is_none")] + pub retry_initial_delay: Option, - /// The `configOverrides` allow overriding arbitrary exchange manager properties. - #[serde(default)] - pub config_overrides: HashMap, + /// Maximum time that a failed task must wait before it is retried. + #[serde(skip_serializing_if = "Option::is_none")] + pub retry_max_delay: Option, + + /// Factor by which retry delay is increased on each task failure. + #[serde(skip_serializing_if = "Option::is_none")] + pub retry_delay_scale_factor: Option, + + /// Data size of the coordinator's in-memory buffer used to store output of query stages. + #[serde(skip_serializing_if = "Option::is_none")] + pub exchange_deduplication_buffer_size: Option, + + /// Exchange manager configuration for spooling intermediate data during fault tolerant execution. + /// Required for Task retry policy. + pub exchange_manager: ExchangeManagerConfig, } -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] -pub struct ExchangeManagerGeneralConfig { +pub struct ExchangeManagerConfig { + /// Whether to enable encryption of spooling data. + #[serde(skip_serializing_if = "Option::is_none")] + pub encryption_enabled: Option, + /// The minimum buffer pool size for an exchange sink. The larger the buffer pool size, /// the larger the write parallelism and memory usage. #[serde(skip_serializing_if = "Option::is_none")] @@ -115,9 +116,17 @@ pub struct ExchangeManagerGeneralConfig { /// concurrent readers, the larger the read parallelism and memory usage. #[serde(skip_serializing_if = "Option::is_none")] pub source_concurrent_readers: Option, + + /// Backend-specific configuration. 
+ #[serde(flatten)] + pub backend: ExchangeManagerBackend, + + /// The `configOverrides` allow overriding arbitrary exchange manager properties. + #[serde(default)] + pub config_overrides: HashMap, } -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] pub enum ExchangeManagerBackend { /// S3-compatible storage configuration. @@ -174,9 +183,6 @@ pub struct LocalExchangeConfig { #[derive(Snafu, Debug)] pub enum Error { - #[snafu(display("Exchange manager is required when using Task retry policy"))] - ExchangeManagerRequiredForTaskPolicy, - #[snafu(display("Failed to resolve S3 connection"))] S3Connection { source: s3::v1alpha1::ConnectionError, @@ -221,82 +227,112 @@ impl ResolvedFaultTolerantExecutionConfig { client: Option<&Client>, namespace: &str, ) -> Result { - if matches!(config.retry_policy, RetryPolicy::Task) && config.exchange_manager.is_none() { - return Err(Error::ExchangeManagerRequiredForTaskPolicy); - } - let mut config_properties = BTreeMap::new(); - let retry_policy = match config.retry_policy { - RetryPolicy::Query => "Query", - RetryPolicy::Task => "Task", + // Handle different retry policies and their configurations + let (retry_policy_str, exchange_manager_opt) = match config { + FaultTolerantExecutionConfig::Query(query_config) => { + // Set query-specific properties + Self::insert_if_present( + &mut config_properties, + "query-retry-attempts", + query_config.retry_attempts, + ); + Self::insert_if_present( + &mut config_properties, + "retry-initial-delay", + query_config + .retry_initial_delay + .as_ref() + .map(|d| format!("{}s", d.as_secs())), + ); + Self::insert_if_present( + &mut config_properties, + "retry-max-delay", + query_config + .retry_max_delay + .as_ref() + .map(|d| format!("{}s", d.as_secs())), + ); + Self::insert_if_present( + &mut config_properties, + "retry-delay-scale-factor", + 
query_config.retry_delay_scale_factor.as_ref(), + ); + Self::insert_if_present( + &mut config_properties, + "exchange.deduplication-buffer-size", + query_config.exchange_deduplication_buffer_size.as_ref(), + ); + + ("QUERY", query_config.exchange_manager.as_ref()) + } + FaultTolerantExecutionConfig::Task(task_config) => { + // Set task-specific properties + Self::insert_if_present( + &mut config_properties, + "task-retry-attempts-per-task", + task_config.retry_attempts_per_task, + ); + Self::insert_if_present( + &mut config_properties, + "retry-initial-delay", + task_config + .retry_initial_delay + .as_ref() + .map(|d| format!("{}s", d.as_secs())), + ); + Self::insert_if_present( + &mut config_properties, + "retry-max-delay", + task_config + .retry_max_delay + .as_ref() + .map(|d| format!("{}s", d.as_secs())), + ); + Self::insert_if_present( + &mut config_properties, + "retry-delay-scale-factor", + task_config.retry_delay_scale_factor.as_ref(), + ); + Self::insert_if_present( + &mut config_properties, + "exchange.deduplication-buffer-size", + task_config.exchange_deduplication_buffer_size.as_ref(), + ); + + ("TASK", Some(&task_config.exchange_manager)) + } }; - config_properties.insert("retry-policy".to_string(), retry_policy.to_string()); - - Self::insert_if_present( - &mut config_properties, - "query-retry-attempts", - config.query_retry_attempts, - ); - Self::insert_if_present( - &mut config_properties, - "task-retry-attempts-per-task", - config.task_retry_attempts_per_task, - ); - Self::insert_if_present( - &mut config_properties, - "retry-initial-delay", - config - .retry_initial_delay - .as_ref() - .map(|d| format!("{}s", d.as_secs())), - ); - Self::insert_if_present( - &mut config_properties, - "retry-max-delay", - config - .retry_max_delay - .as_ref() - .map(|d| format!("{}s", d.as_secs())), - ); - Self::insert_if_present( - &mut config_properties, - "retry-delay-scale-factor", - config.retry_delay_scale_factor.as_ref(), - ); - Self::insert_if_present( - 
&mut config_properties, - "exchange.deduplication-buffer-size", - config.exchange_deduplication_buffer_size.as_ref(), - ); - Self::insert_if_present( - &mut config_properties, - "fault-tolerant-execution.exchange-encryption-enabled", - config.exchange_encryption_enabled, - ); + + config_properties.insert("retry-policy".to_string(), retry_policy_str.to_string()); let mut exchange_manager_properties = BTreeMap::new(); - if let Some(exchange_config) = &config.exchange_manager { - // Add general properties + if let Some(exchange_config) = exchange_manager_opt { + Self::insert_if_present( + &mut config_properties, + "fault-tolerant-execution.exchange-encryption-enabled", + exchange_config.encryption_enabled, + ); Self::insert_if_present( &mut exchange_manager_properties, "exchange.sink-buffer-pool-min-size", - exchange_config.general.sink_buffer_pool_min_size, + exchange_config.sink_buffer_pool_min_size, ); Self::insert_if_present( &mut exchange_manager_properties, "exchange.sink-buffers-per-partition", - exchange_config.general.sink_buffers_per_partition, + exchange_config.sink_buffers_per_partition, ); Self::insert_if_present( &mut exchange_manager_properties, "exchange.sink-max-file-size", - exchange_config.general.sink_max_file_size.as_ref(), + exchange_config.sink_max_file_size.as_ref(), ); Self::insert_if_present( &mut exchange_manager_properties, "exchange.source-concurrent-readers", - exchange_config.general.source_concurrent_readers, + exchange_config.source_concurrent_readers, ); // Add backend-specific configuration @@ -383,7 +419,7 @@ impl ResolvedFaultTolerantExecutionConfig { // Resolve external resources if Kubernetes client is available // This should always be the case, except for when this function is called during unit tests - if let (Some(client), Some(exchange_config)) = (client, &config.exchange_manager) { + if let (Some(client), Some(exchange_config)) = (client, exchange_manager_opt) { match &exchange_config.backend { 
ExchangeManagerBackend::S3(s3_config) => { resolved_config @@ -505,17 +541,14 @@ mod tests { #[tokio::test] async fn test_query_retry_policy_without_exchange_manager() { - let config = FaultTolerantExecutionConfig { - retry_policy: RetryPolicy::Query, - exchange_manager: None, - query_retry_attempts: Some(5), - task_retry_attempts_per_task: None, + let config = FaultTolerantExecutionConfig::Query(QueryRetryConfig { + retry_attempts: Some(5), retry_initial_delay: Some(Duration::from_secs(15)), retry_max_delay: Some(Duration::from_secs(90)), retry_delay_scale_factor: Some(3.0), exchange_deduplication_buffer_size: Some("64MB".to_string()), - exchange_encryption_enabled: Some(false), - }; + exchange_manager: None, + }); let fte_config = ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default") @@ -524,7 +557,7 @@ mod tests { assert_eq!( fte_config.config_properties.get("retry-policy"), - Some(&"Query".to_string()) + Some(&"QUERY".to_string()) ); assert_eq!( fte_config.config_properties.get("query-retry-attempts"), @@ -548,48 +581,95 @@ mod tests { .get("exchange.deduplication-buffer-size"), Some(&"64MB".to_string()) ); + } + + #[tokio::test] + async fn test_query_retry_policy_with_exchange_manager() { + let config = FaultTolerantExecutionConfig::Query(QueryRetryConfig { + retry_attempts: Some(3), + retry_initial_delay: Some(Duration::from_secs(10)), + retry_max_delay: Some(Duration::from_secs(60)), + retry_delay_scale_factor: Some(2.0), + exchange_deduplication_buffer_size: Some("100MB".to_string()), + exchange_manager: Some(ExchangeManagerConfig { + encryption_enabled: Some(true), + sink_buffer_pool_min_size: Some(10), + sink_buffers_per_partition: Some(2), + sink_max_file_size: Some("1GB".to_string()), + source_concurrent_readers: Some(4), + backend: ExchangeManagerBackend::Local(LocalExchangeConfig { + base_directories: vec!["/tmp/exchange".to_string()], + }), + config_overrides: HashMap::new(), + }), + }); + + let fte_config = + 
ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default") + .await + .unwrap(); + + assert_eq!( + fte_config.config_properties.get("retry-policy"), + Some(&"QUERY".to_string()) + ); + assert_eq!( + fte_config.config_properties.get("query-retry-attempts"), + Some(&"3".to_string()) + ); + assert_eq!( + fte_config.config_properties.get("retry-initial-delay"), + Some(&"10s".to_string()) + ); + assert_eq!( + fte_config.config_properties.get("retry-max-delay"), + Some(&"60s".to_string()) + ); + assert_eq!( + fte_config.config_properties.get("retry-delay-scale-factor"), + Some(&"2".to_string()) + ); + + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange-manager.name"), + Some(&"filesystem".to_string()) + ); + assert_eq!( + fte_config + .exchange_manager_properties + .get("exchange.base-directories"), + Some(&"/tmp/exchange".to_string()) + ); + assert_eq!( + fte_config + .config_properties + .get("exchange.deduplication-buffer-size"), + Some(&"100MB".to_string()) + ); assert_eq!( fte_config .config_properties .get("fault-tolerant-execution.exchange-encryption-enabled"), - Some(&"false".to_string()) + Some(&"true".to_string()) ); - assert!(fte_config.exchange_manager_properties.is_empty()); } #[tokio::test] - async fn test_task_retry_policy_requires_exchange_manager() { - let config = FaultTolerantExecutionConfig { - retry_policy: RetryPolicy::Task, - exchange_manager: None, - query_retry_attempts: None, - task_retry_attempts_per_task: Some(3), + async fn test_task_retry_policy_with_s3_exchange_manager() { + let config = FaultTolerantExecutionConfig::Task(TaskRetryConfig { + retry_attempts_per_task: Some(2), retry_initial_delay: None, retry_max_delay: None, retry_delay_scale_factor: None, exchange_deduplication_buffer_size: None, - exchange_encryption_enabled: None, - }; - - let result = - ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default").await; - assert!(matches!( - result, - 
Err(Error::ExchangeManagerRequiredForTaskPolicy) - )); - } - - #[tokio::test] - async fn test_task_retry_policy_with_s3_exchange_manager() { - let config = FaultTolerantExecutionConfig { - retry_policy: RetryPolicy::Task, - exchange_manager: Some(ExchangeManagerConfig { - general: ExchangeManagerGeneralConfig { - sink_buffer_pool_min_size: Some(20), - sink_buffers_per_partition: Some(4), - sink_max_file_size: Some("2GB".to_string()), - source_concurrent_readers: Some(8), - }, + exchange_manager: ExchangeManagerConfig { + encryption_enabled: None, + sink_buffer_pool_min_size: Some(20), + sink_buffers_per_partition: Some(4), + sink_max_file_size: Some("2GB".to_string()), + source_concurrent_readers: Some(8), backend: ExchangeManagerBackend::S3(S3ExchangeConfig { base_directories: vec!["s3://my-bucket/exchange".to_string()], connection: stackable_operator::crd::s3::v1alpha1::InlineConnectionOrReference::Reference( @@ -601,15 +681,8 @@ mod tests { upload_part_size: Some("10MB".to_string()), }), config_overrides: std::collections::HashMap::new(), - }), - query_retry_attempts: None, - task_retry_attempts_per_task: Some(2), - retry_initial_delay: None, - retry_max_delay: None, - retry_delay_scale_factor: None, - exchange_deduplication_buffer_size: None, - exchange_encryption_enabled: None, - }; + }, + }); let fte_config = ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default") @@ -618,7 +691,7 @@ mod tests { assert_eq!( fte_config.config_properties.get("retry-policy"), - Some(&"Task".to_string()) + Some(&"TASK".to_string()) ); assert_eq!( fte_config @@ -699,15 +772,18 @@ mod tests { "overridden-value".to_string(), ); - let config = FaultTolerantExecutionConfig { - retry_policy: RetryPolicy::Task, - exchange_manager: Some(ExchangeManagerConfig { - general: ExchangeManagerGeneralConfig { - sink_buffer_pool_min_size: None, - sink_buffers_per_partition: None, - sink_max_file_size: None, - source_concurrent_readers: None, - }, + let config = 
FaultTolerantExecutionConfig::Task(TaskRetryConfig { + retry_attempts_per_task: Some(2), + retry_initial_delay: None, + retry_max_delay: None, + retry_delay_scale_factor: None, + exchange_deduplication_buffer_size: None, + exchange_manager: ExchangeManagerConfig { + encryption_enabled: None, + sink_buffer_pool_min_size: None, + sink_buffers_per_partition: None, + sink_max_file_size: None, + source_concurrent_readers: None, backend: ExchangeManagerBackend::S3(S3ExchangeConfig { base_directories: vec!["s3://my-bucket/exchange".to_string()], connection: stackable_operator::crd::s3::v1alpha1::InlineConnectionOrReference::Reference( @@ -719,15 +795,8 @@ mod tests { upload_part_size: Some("original-value".to_string()), }), config_overrides, - }), - query_retry_attempts: None, - task_retry_attempts_per_task: Some(2), - retry_initial_delay: None, - retry_max_delay: None, - retry_delay_scale_factor: None, - exchange_deduplication_buffer_size: None, - exchange_encryption_enabled: None, - }; + }, + }); let fte_config = ResolvedFaultTolerantExecutionConfig::from_config(&config, None, "default") diff --git a/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml b/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml index 2247b8f1..6475d861 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml @@ -6,6 +6,6 @@ commands: helm install minio --namespace $NAMESPACE --version 15.0.7 - -f helm-bitnami-minio-values.yaml + -f 01_helm-bitnami-minio-values.yaml oci://registry-1.docker.io/bitnamicharts/minio timeout: 240 diff --git a/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 b/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 index 955b3ea2..1cbc298e 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 +++ 
b/tests/templates/kuttl/fault-tolerant-execution/02-install-trino.yaml.j2 @@ -18,13 +18,13 @@ spec: trino: trino-fte # Fault tolerant execution with S3/MinIO exchange manager faultTolerantExecution: - retryPolicy: Task - exchangeManager: - s3: - baseDirectories: - - "s3://exchange-bucket/" - connection: - reference: "minio" + task: + exchangeManager: + s3: + baseDirectories: + - "s3://exchange-bucket/" + connection: + reference: "minio" {% if lookup('env', 'VECTOR_AGGREGATOR') %} vectorAggregatorConfigMapName: vector-aggregator-discovery {% endif %} diff --git a/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml b/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml index aea8e8b6..c37cff38 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml @@ -2,4 +2,4 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - - script: kubectl cp -n $NAMESPACE check-fte.py trino-test-helper-0:/tmp/ + - script: kubectl cp -n $NAMESPACE 04_check-fte.py trino-test-helper-0:/tmp/ diff --git a/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py b/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py index f4ecb339..75ad8ec9 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py +++ b/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py @@ -88,6 +88,8 @@ def get_connection(coordinator): print("Complex query returned no results") exit(-1) + print("Complex query test passed") + except Exception as e: print(f"Test failed with error: {e}") import traceback diff --git a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml index 988161c7..08ffac17 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml @@ -2,10 +2,11 @@ apiVersion: 
kuttl.dev/v1beta1 kind: TestStep commands: - - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-fte.py -c trino-fte-coordinator -w 2 + - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/04_check-fte.py -c trino-fte-coordinator -w 2 timeout: 120 # Verify that the exchange bucket contains data - script: | + sleep 10 count=$(kubectl exec -n $NAMESPACE deployment/minio -- mc stat local/exchange-bucket | awk '/Objects count:/ {print $3}') if [ "$count" -gt 0 ]; then echo "Objects count is $count (> 0)" @@ -13,4 +14,4 @@ commands: echo "Objects count is $count (not > 0)" exit 1 fi - timeout: 20 + timeout: 30 From 2bd9e45d6eb647482216774cb6afecf0ebb527e8 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Thu, 7 Aug 2025 16:05:22 +0200 Subject: [PATCH 13/26] feat: adapted graceful shutdown docs --- .../operations/graceful-shutdown.adoc | 8 +++--- .../src/operations/graceful_shutdown.rs | 27 ++++++++++++------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/docs/modules/trino/pages/usage-guide/operations/graceful-shutdown.adoc b/docs/modules/trino/pages/usage-guide/operations/graceful-shutdown.adoc index f24ce39e..a82ada2e 100644 --- a/docs/modules/trino/pages/usage-guide/operations/graceful-shutdown.adoc +++ b/docs/modules/trino/pages/usage-guide/operations/graceful-shutdown.adoc @@ -80,10 +80,12 @@ spec: All queries that take less than the minimal graceful shutdown period of all roleGroups (`1` hour as a default) are guaranteed to not be disturbed by regular termination of Pods. They can obviously still fail when, for example, a Kubernetes node dies or gets rebooted before it is fully drained. -Because of this, the operator automatically restricts the execution time of queries to the minimal graceful shutdown period of all roleGroups using the Trino configuration `query.max-execution-time=3600s`. 
+Because of this, the operator automatically restricts the execution time of queries to the minimal graceful shutdown period of all roleGroups using the Trino configuration `query.max-execution-time=3600s` when xref:usage-guide/fault-tolerant-execution.adoc[fault tolerant execution] is not configured. This causes all queries that take longer than 1 hour to fail with the error message `Query failed: Query exceeded the maximum execution time limit of 3600s.00s`. -In case you need to execute queries that take longer than the configured graceful shutdown period, you need to increase the `query.max-execution-time` property as follows: +However, when xref:usage-guide/fault-tolerant-execution.adoc[fault tolerant execution] is enabled, the `query.max-execution-time` restriction is not applied since queries can be automatically retried in case of failures, allowing them to run indefinitely without being cancelled by worker restarts. + +In case you need to execute queries that take longer than the configured graceful shutdown period and do not want to configure fault tolerant execution, you can increase the `query.max-execution-time` property as follows: [source,yaml] ---- @@ -95,8 +97,6 @@ spec: ---- Keep in mind, that queries taking longer than the graceful shutdown period are now subject to failure when a Trino worker gets shut down. -Running into this issue can be circumvented by using https://trino.io/docs/current/admin/fault-tolerant-execution.html[Fault-tolerant execution], which is not supported natively yet. -Until native support is added, you will have to use `configOverrides` to enable it. 
== Authorization requirements diff --git a/rust/operator-binary/src/operations/graceful_shutdown.rs b/rust/operator-binary/src/operations/graceful_shutdown.rs index 92c56a51..5b00bbab 100644 --- a/rust/operator-binary/src/operations/graceful_shutdown.rs +++ b/rust/operator-binary/src/operations/graceful_shutdown.rs @@ -27,16 +27,23 @@ pub fn graceful_shutdown_config_properties( ) -> BTreeMap> { match role { TrinoRole::Coordinator => { - let min_worker_graceful_shutdown_timeout = trino.min_worker_graceful_shutdown_timeout(); - // We know that queries taking longer than the minimum gracefulShutdownTimeout are subject to failure. - // Read operator docs for reasoning. - BTreeMap::from([( - "query.max-execution-time".to_string(), - Some(format!( - "{}s", - min_worker_graceful_shutdown_timeout.as_secs() - )), - )]) + // Only set query.max-execution-time if fault tolerant execution is not configured. + // With fault tolerant execution enabled, queries can be retried and run indefinitely. + if trino.spec.cluster_config.fault_tolerant_execution.is_none() { + let min_worker_graceful_shutdown_timeout = + trino.min_worker_graceful_shutdown_timeout(); + // We know that queries taking longer than the minimum gracefulShutdownTimeout are subject to failure. + // Read operator docs for reasoning. 
+ BTreeMap::from([( + "query.max-execution-time".to_string(), + Some(format!( + "{}s", + min_worker_graceful_shutdown_timeout.as_secs() + )), + )]) + } else { + BTreeMap::new() + } } TrinoRole::Worker => BTreeMap::from([( "shutdown.grace-period".to_string(), From 3c45e01ef53a30dfbb9b2765b80ef638e3ad8e18 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Tue, 12 Aug 2025 10:45:12 +0200 Subject: [PATCH 14/26] chore: add newlines after attributes --- .../src/crd/fault_tolerant_execution.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs b/rust/operator-binary/src/crd/fault_tolerant_execution.rs index 7e40e7f7..76a6ed5d 100644 --- a/rust/operator-binary/src/crd/fault_tolerant_execution.rs +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -30,6 +30,7 @@ use crate::{ pub enum FaultTolerantExecutionConfig { /// Query-level fault tolerant execution. Retries entire queries on failure. Query(QueryRetryConfig), + /// Task-level fault tolerant execution. Retries individual tasks on failure (requires exchange manager). Task(TaskRetryConfig), } @@ -131,8 +132,10 @@ pub struct ExchangeManagerConfig { pub enum ExchangeManagerBackend { /// S3-compatible storage configuration. S3(S3ExchangeConfig), + /// HDFS-based exchange manager. Hdfs(HdfsExchangeConfig), + /// Local filesystem storage (not recommended for production). Local(LocalExchangeConfig), } @@ -142,18 +145,23 @@ pub enum ExchangeManagerBackend { pub struct S3ExchangeConfig { /// S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). pub base_directories: Vec, + /// S3 connection configuration. /// Learn more about S3 configuration in the [S3 concept docs](DOCS_BASE_URL_PLACEHOLDER/concepts/s3). pub connection: stackable_operator::crd::s3::v1alpha1::InlineConnectionOrReference, + /// IAM role to assume for S3 access. 
#[serde(skip_serializing_if = "Option::is_none")] pub iam_role: Option, + /// External ID for the IAM role trust policy. #[serde(skip_serializing_if = "Option::is_none")] pub external_id: Option, + /// Maximum number of times the S3 client should retry a request. #[serde(skip_serializing_if = "Option::is_none")] pub max_error_retries: Option, + /// Part data size for S3 multi-part upload. #[serde(skip_serializing_if = "Option::is_none")] pub upload_part_size: Option, @@ -164,11 +172,14 @@ pub struct S3ExchangeConfig { pub struct HdfsExchangeConfig { /// HDFS URIs for spooling data. pub base_directories: Vec, + /// HDFS connection configuration. pub hdfs: HdfsConnection, + /// Block data size for HDFS storage. #[serde(skip_serializing_if = "Option::is_none")] pub block_size: Option, + /// Skip directory scheme validation to support Hadoop-compatible file systems. #[serde(skip_serializing_if = "Option::is_none")] pub skip_directory_scheme_validation: Option, @@ -196,15 +207,20 @@ pub enum Error { pub struct ResolvedFaultTolerantExecutionConfig { /// Properties to add to config.properties pub config_properties: BTreeMap, + /// Properties to add to exchange-manager.properties (if needed) pub exchange_manager_properties: BTreeMap, + /// Volumes required for the configuration (e.g., for S3 credentials) pub volumes: Vec, + /// Volume mounts required for the configuration pub volume_mounts: Vec, + /// Env-Vars that should be exported from files. 
/// You can think of it like `export ="$(cat )"` pub load_env_from_files: BTreeMap, + /// Additional commands that need to be executed before starting Trino pub init_container_extra_start_commands: Vec, } From edac3f9c58b2eae9146e6eee6803a0c8545c048a Mon Sep 17 00:00:00 2001 From: dervoeti Date: Tue, 12 Aug 2025 14:26:08 +0200 Subject: [PATCH 15/26] chore: MinIO legacy charts and updated version --- .../usage-guide/fault-tolerant-execution.adoc | 2 +- .../01-install-minio.yaml | 2 +- .../01_helm-bitnami-minio-values.yaml | 25 +++++++++++++++++++ .../05-run-tests.yaml | 3 ++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index e4722188..39aed758 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -204,7 +204,7 @@ Here's an example of a Trino cluster with fault-tolerant execution enabled using [source,bash] ---- stackablectl operator install commons secret listener trino -helm install minio minio --repo https://charts.bitnami.com/bitnami --version 15.0.7 --set auth.rootUser=minio-access-key --set auth.rootPassword=minio-secret-key --set tls.enabled=true --set tls.existingSecret=minio-tls-certificates --set provisioning.enabled=true --set provisioning.buckets[0].name=trino-exchange-bucket +helm install minio oci://registry-1.docker.io/bitnamicharts/minio --version 17.0.19 --set auth.rootUser=minio-access-key --set auth.rootPassword=minio-secret-key --set tls.enabled=true --set tls.server.existingSecret=minio-tls-certificates --set tls.existingSecret=minio-tls-certificates --set tls.existingCASecret=minio-tls-certificates --set tls.autoGenerated.enabled=false --set provisioning.enabled=true --set provisioning.buckets[0].name=trino-exchange-bucket --set global.security.allowInsecureImages=true --set 
image.repository=bitnamilegacy/minio --set clientImage.repository=bitnamilegacy/minio-client --set defaultInitContainers.volumePermissions.image.repository=bitnamilegacy/os-shell --set console.image.repository=bitnamilegacy/minio-object-browser ---- [source,yaml] diff --git a/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml b/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml index 6475d861..7a063b49 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/01-install-minio.yaml @@ -5,7 +5,7 @@ commands: - script: >- helm install minio --namespace $NAMESPACE - --version 15.0.7 + --version 17.0.19 -f 01_helm-bitnami-minio-values.yaml oci://registry-1.docker.io/bitnamicharts/minio timeout: 240 diff --git a/tests/templates/kuttl/fault-tolerant-execution/01_helm-bitnami-minio-values.yaml b/tests/templates/kuttl/fault-tolerant-execution/01_helm-bitnami-minio-values.yaml index 81c01ac9..367669e8 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/01_helm-bitnami-minio-values.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/01_helm-bitnami-minio-values.yaml @@ -1,4 +1,21 @@ --- +global: + security: + allowInsecureImages: true + +image: + repository: bitnamilegacy/minio +clientImage: + repository: bitnamilegacy/minio-client +defaultInitContainers: + volumePermissions: # volumePermissions moved under defaultInitContainers starting with Chart version 17.0.0 + enabled: false + image: + repository: bitnamilegacy/os-shell +console: + image: + repository: bitnamilegacy/minio-object-browser + mode: standalone disableWebUI: false extraEnvVars: @@ -23,8 +40,11 @@ provisioning: containerSecurityContext: enabled: false +# volumePermissions can be removed starting with Chart version 17.0.0, moved under defaultInitContainers volumePermissions: enabled: false + image: + repository: bitnamilegacy/os-shell podSecurityContext: enabled: false @@ -51,4 +71,9 @@ 
service: tls: enabled: true + autoGenerated: + enabled: false + existingCASecret: minio-tls-certificates existingSecret: minio-tls-certificates + server: + existingSecret: minio-tls-certificates diff --git a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml index 08ffac17..aa0e3601 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml @@ -7,7 +7,8 @@ commands: # Verify that the exchange bucket contains data - script: | sleep 10 - count=$(kubectl exec -n $NAMESPACE deployment/minio -- mc stat local/exchange-bucket | awk '/Objects count:/ {print $3}') + kubectl exec -n $NAMESPACE deployment/minio -- mc alias set local https://localhost:9000 minioAccessKey minioSecretKey --api S3v4 + count=$(kubectl exec -n $NAMESPACE deployment/minio -- mc stat --insecure local/exchange-bucket | awk '/Objects count:/ {print $3}') if [ "$count" -gt 0 ]; then echo "Objects count is $count (> 0)" else From 6c6a8979ef88a8dd739777ea1f2ac73c04845167 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Wed, 13 Aug 2025 13:19:50 +0200 Subject: [PATCH 16/26] feat: use quantities instead of strings --- .../src/crd/fault_tolerant_execution.rs | 76 ++++++++++++------- rust/operator-binary/src/crd/mod.rs | 9 +++ 2 files changed, 58 insertions(+), 27 deletions(-) diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs b/rust/operator-binary/src/crd/fault_tolerant_execution.rs index 76a6ed5d..001ab58e 100644 --- a/rust/operator-binary/src/crd/fault_tolerant_execution.rs +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -15,6 +15,7 @@ use stackable_operator::{ commons::tls_verification::{CaCert, TlsServerVerification, TlsVerification}, crd::s3, k8s_openapi::api::core::v1::{Volume, VolumeMount}, + k8s_openapi::apimachinery::pkg::api::resource::Quantity, schemars::{self, JsonSchema}, time::Duration, }; @@ 
-56,7 +57,7 @@ pub struct QueryRetryConfig { /// Data size of the coordinator's in-memory buffer used to store output of query stages. #[serde(skip_serializing_if = "Option::is_none")] - pub exchange_deduplication_buffer_size: Option, + pub exchange_deduplication_buffer_size: Option, /// Exchange manager configuration for spooling intermediate data during fault tolerant execution. /// Optional for Query retry policy, recommended for large result sets. @@ -85,7 +86,7 @@ pub struct TaskRetryConfig { /// Data size of the coordinator's in-memory buffer used to store output of query stages. #[serde(skip_serializing_if = "Option::is_none")] - pub exchange_deduplication_buffer_size: Option, + pub exchange_deduplication_buffer_size: Option, /// Exchange manager configuration for spooling intermediate data during fault tolerant execution. /// Required for Task retry policy. @@ -111,7 +112,7 @@ pub struct ExchangeManagerConfig { /// Max data size of files written by exchange sinks. #[serde(skip_serializing_if = "Option::is_none")] - pub sink_max_file_size: Option, + pub sink_max_file_size: Option, /// Number of concurrent readers to read from spooling storage. The larger the number of /// concurrent readers, the larger the read parallelism and memory usage. @@ -140,7 +141,7 @@ pub enum ExchangeManagerBackend { Local(LocalExchangeConfig), } -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] pub struct S3ExchangeConfig { /// S3 bucket URIs for spooling data (e.g., s3://bucket1,s3://bucket2). @@ -164,10 +165,10 @@ pub struct S3ExchangeConfig { /// Part data size for S3 multi-part upload. 
#[serde(skip_serializing_if = "Option::is_none")] - pub upload_part_size: Option, + pub upload_part_size: Option, } -#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] pub struct HdfsExchangeConfig { /// HDFS URIs for spooling data. @@ -178,7 +179,7 @@ pub struct HdfsExchangeConfig { /// Block data size for HDFS storage. #[serde(skip_serializing_if = "Option::is_none")] - pub block_size: Option, + pub block_size: Option, /// Skip directory scheme validation to support Hadoop-compatible file systems. #[serde(skip_serializing_if = "Option::is_none")] @@ -201,6 +202,12 @@ pub enum Error { #[snafu(display("trino does not support disabling the TLS verification of S3 servers"))] S3TlsNoVerificationNotSupported, + + #[snafu(display("failed to convert data size for [{field}] to bytes"))] + QuantityConversion { + source: stackable_operator::memory::Error, + field: &'static str, + }, } /// Fault tolerant execution configuration with external resources resolved @@ -237,6 +244,21 @@ impl ResolvedFaultTolerantExecutionConfig { } } + /// Helper function to insert optional Quantity values after converting to Trino bytes string + fn insert_quantity_if_present( + properties: &mut BTreeMap, + key: &'static str, + quantity: Option<&Quantity>, + ) -> Result<(), Error> { + if let Some(q) = quantity { + use snafu::ResultExt; + let v = crate::crd::quantity_to_trino_bytes(q) + .context(QuantityConversionSnafu { field: key })?; + properties.insert(key.to_string(), v); + } + Ok(()) + } + /// Create a resolved fault tolerant execution configuration from the cluster config pub async fn from_config( config: &FaultTolerantExecutionConfig, @@ -275,11 +297,11 @@ impl ResolvedFaultTolerantExecutionConfig { "retry-delay-scale-factor", query_config.retry_delay_scale_factor.as_ref(), ); - Self::insert_if_present( + Self::insert_quantity_if_present( &mut config_properties, 
"exchange.deduplication-buffer-size", query_config.exchange_deduplication_buffer_size.as_ref(), - ); + )?; ("QUERY", query_config.exchange_manager.as_ref()) } @@ -311,11 +333,11 @@ impl ResolvedFaultTolerantExecutionConfig { "retry-delay-scale-factor", task_config.retry_delay_scale_factor.as_ref(), ); - Self::insert_if_present( + Self::insert_quantity_if_present( &mut config_properties, "exchange.deduplication-buffer-size", task_config.exchange_deduplication_buffer_size.as_ref(), - ); + )?; ("TASK", Some(&task_config.exchange_manager)) } @@ -340,11 +362,11 @@ impl ResolvedFaultTolerantExecutionConfig { "exchange.sink-buffers-per-partition", exchange_config.sink_buffers_per_partition, ); - Self::insert_if_present( + Self::insert_quantity_if_present( &mut exchange_manager_properties, "exchange.sink-max-file-size", exchange_config.sink_max_file_size.as_ref(), - ); + )?; Self::insert_if_present( &mut exchange_manager_properties, "exchange.source-concurrent-readers", @@ -378,11 +400,11 @@ impl ResolvedFaultTolerantExecutionConfig { "exchange.s3.max-error-retries", s3_config.max_error_retries, ); - Self::insert_if_present( + Self::insert_quantity_if_present( &mut exchange_manager_properties, "exchange.s3.upload.part-size", s3_config.upload_part_size.as_ref(), - ); + )?; } ExchangeManagerBackend::Hdfs(hdfs_config) => { exchange_manager_properties @@ -392,11 +414,11 @@ impl ResolvedFaultTolerantExecutionConfig { hdfs_config.base_directories.join(","), ); - Self::insert_if_present( + Self::insert_quantity_if_present( &mut exchange_manager_properties, "exchange.hdfs.block-size", hdfs_config.block_size.as_ref(), - ); + )?; Self::insert_if_present( &mut exchange_manager_properties, "exchange.hdfs.skip-directory-scheme-validation", @@ -562,7 +584,7 @@ mod tests { retry_initial_delay: Some(Duration::from_secs(15)), retry_max_delay: Some(Duration::from_secs(90)), retry_delay_scale_factor: Some(3.0), - exchange_deduplication_buffer_size: Some("64MB".to_string()), + 
exchange_deduplication_buffer_size: Some(Quantity("64Mi".to_string())), exchange_manager: None, }); @@ -595,7 +617,7 @@ mod tests { fte_config .config_properties .get("exchange.deduplication-buffer-size"), - Some(&"64MB".to_string()) + Some(&"67108864B".to_string()) ); } @@ -606,12 +628,12 @@ mod tests { retry_initial_delay: Some(Duration::from_secs(10)), retry_max_delay: Some(Duration::from_secs(60)), retry_delay_scale_factor: Some(2.0), - exchange_deduplication_buffer_size: Some("100MB".to_string()), + exchange_deduplication_buffer_size: Some(Quantity("100Mi".to_string())), exchange_manager: Some(ExchangeManagerConfig { encryption_enabled: Some(true), sink_buffer_pool_min_size: Some(10), sink_buffers_per_partition: Some(2), - sink_max_file_size: Some("1GB".to_string()), + sink_max_file_size: Some(Quantity("1Gi".to_string())), source_concurrent_readers: Some(4), backend: ExchangeManagerBackend::Local(LocalExchangeConfig { base_directories: vec!["/tmp/exchange".to_string()], @@ -662,7 +684,7 @@ mod tests { fte_config .config_properties .get("exchange.deduplication-buffer-size"), - Some(&"100MB".to_string()) + Some(&"104857600B".to_string()) ); assert_eq!( fte_config @@ -684,7 +706,7 @@ mod tests { encryption_enabled: None, sink_buffer_pool_min_size: Some(20), sink_buffers_per_partition: Some(4), - sink_max_file_size: Some("2GB".to_string()), + sink_max_file_size: Some(Quantity("2Gi".to_string())), source_concurrent_readers: Some(8), backend: ExchangeManagerBackend::S3(S3ExchangeConfig { base_directories: vec!["s3://my-bucket/exchange".to_string()], @@ -694,7 +716,7 @@ mod tests { iam_role: Some("arn:aws:iam::123456789012:role/TrinoRole".to_string()), external_id: Some("external-id-123".to_string()), max_error_retries: Some(5), - upload_part_size: Some("10MB".to_string()), + upload_part_size: Some(Quantity("10Mi".to_string())), }), config_overrides: std::collections::HashMap::new(), }, @@ -751,7 +773,7 @@ mod tests { fte_config .exchange_manager_properties 
.get("exchange.s3.upload.part-size"), - Some(&"10MB".to_string()) + Some(&"10485760B".to_string()) ); assert_eq!( fte_config @@ -769,7 +791,7 @@ mod tests { fte_config .exchange_manager_properties .get("exchange.sink-max-file-size"), - Some(&"2GB".to_string()) + Some(&"2147483648B".to_string()) ); assert_eq!( fte_config @@ -808,7 +830,7 @@ mod tests { iam_role: None, external_id: None, max_error_retries: None, - upload_part_size: Some("original-value".to_string()), + upload_part_size: Some(Quantity("10Mi".to_string())), }), config_overrides, }, diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 9a73bca4..14cab937 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -137,6 +137,15 @@ pub const WORKER_SHUTDOWN_GRACE_PERIOD: Duration = Duration::from_secs(30); /// Safety puffer to guarantee the graceful shutdown works every time. pub const WORKER_GRACEFUL_SHUTDOWN_SAFETY_OVERHEAD: Duration = Duration::from_secs(10); +/// Convert a Kubernetes `Quantity` to a Trino property string in bytes, e.g. `"65536B"`. 
+pub(crate) fn quantity_to_trino_bytes( + q: &Quantity, +) -> Result { + let in_mebi = MemoryQuantity::try_from(q)?.scale_to(BinaryMultiple::Mebi); + let bytes = (in_mebi.value * 1024.0 * 1024.0).round() as u64; + Ok(format!("{bytes}B")) +} + #[derive(Snafu, Debug)] pub enum Error { #[snafu(display("object has no namespace associated"))] From 97d2d1f2e19e8be3319069a79c8bd95a385480d7 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Wed, 13 Aug 2025 13:46:47 +0200 Subject: [PATCH 17/26] fix: moved to quantities in the FTE docs example --- .../examples/usage-guide/fault-tolerant-execution.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml index 89125c0a..870406ef 100644 --- a/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml +++ b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml @@ -16,12 +16,12 @@ spec: retryInitialDelay: 10s retryMaxDelay: 60s retryDelayScaleFactor: 2.0 - exchangeDeduplicationBufferSize: 64MB + exchangeDeduplicationBufferSize: 64Mi exchangeManager: encryptionEnabled: true sinkBufferPoolMinSize: 20 sinkBuffersPerPartition: 4 - sinkMaxFileSize: 2GB + sinkMaxFileSize: 2Gi sourceConcurrentReaders: 8 s3: baseDirectories: @@ -29,7 +29,7 @@ spec: connection: reference: minio-connection maxErrorRetries: 10 - uploadPartSize: 10MB + uploadPartSize: 10Mi coordinators: roleGroups: default: From fab0e45c165da98b6c275ab8afe1c9a99b4e2416 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Wed, 13 Aug 2025 13:48:38 +0200 Subject: [PATCH 18/26] fix: moved to quantities in the FTE docs --- .../trino/pages/usage-guide/fault-tolerant-execution.adoc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index 
39aed758..daa88934 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -24,8 +24,8 @@ The configuration uses a structured approach where you choose either `query` or A `query` retry policy instructs Trino to automatically retry a query in the event of an error occurring on a worker node. This policy is recommended when the majority of the Trino cluster's workload consists of many small queries. -By default, Trino does not implement fault tolerance for queries whose result set exceeds 32MB in size. -This limit can be increased by modifying the `exchangeDeduplicationBufferSize` configuration property to be greater than the default value of `32MB`, but this results in higher memory usage on the coordinator. +By default, Trino does not implement fault tolerance for queries whose result set exceeds 32Mi in size. +This limit can be increased by modifying the `exchangeDeduplicationBufferSize` configuration property to be greater than the default value of `32Mi`, but this results in higher memory usage on the coordinator. 
[source,yaml] ---- @@ -34,7 +34,7 @@ spec: faultTolerantExecution: query: retryAttempts: 3 - exchangeDeduplicationBufferSize: 64MB # Increased from default 32MB + exchangeDeduplicationBufferSize: 64Mi # Increased from default 32Mi ---- === Task retry policy From 096afe699cb94440a2b9cc4490d9fda30aac2f25 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Wed, 13 Aug 2025 14:01:51 +0200 Subject: [PATCH 19/26] chore: cargo fmt --- rust/operator-binary/src/crd/fault_tolerant_execution.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rust/operator-binary/src/crd/fault_tolerant_execution.rs b/rust/operator-binary/src/crd/fault_tolerant_execution.rs index 001ab58e..7db48e51 100644 --- a/rust/operator-binary/src/crd/fault_tolerant_execution.rs +++ b/rust/operator-binary/src/crd/fault_tolerant_execution.rs @@ -14,8 +14,10 @@ use stackable_operator::{ client::Client, commons::tls_verification::{CaCert, TlsServerVerification, TlsVerification}, crd::s3, - k8s_openapi::api::core::v1::{Volume, VolumeMount}, - k8s_openapi::apimachinery::pkg::api::resource::Quantity, + k8s_openapi::{ + api::core::v1::{Volume, VolumeMount}, + apimachinery::pkg::api::resource::Quantity, + }, schemars::{self, JsonSchema}, time::Duration, }; From b9f5ca9235f1937ca76c9f12764f1b65dd1ea4a4 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Mon, 18 Aug 2025 10:17:44 +0200 Subject: [PATCH 20/26] chore: pre-commit fix --- .../trino/examples/usage-guide/fault-tolerant-execution.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml index 870406ef..f84879d6 100644 --- a/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml +++ b/docs/modules/trino/examples/usage-guide/fault-tolerant-execution.yaml @@ -105,4 +105,3 @@ metadata: spec: connector: tpch: {} - From 8afac3105ef940a4bca80bb22c0c5c3d6978ca49 Mon Sep 17 00:00:00 2001 From: Lukas Krug 
Date: Mon, 18 Aug 2025 14:20:19 +0200 Subject: [PATCH 21/26] Update docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc Co-authored-by: Malte Sander --- .../trino/pages/usage-guide/fault-tolerant-execution.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index daa88934..2cad666b 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -3,7 +3,7 @@ :keywords: fault-tolerant execution, retry policy, exchange manager, spooling, query resilience Fault-tolerant execution is a mechanism in Trino that enables a cluster to mitigate query failures by retrying queries or their component tasks in the event of failure. -With fault-tolerant execution enabled, intermediate exchange data is spooled and can be re-used by another worker in the event of a worker outage or other fault during query execution. +With fault-tolerant execution enabled, intermediate exchange data is spooled and can be re-used by another worker in the event of a worker outage or other failures during query execution. By default, if a Trino node lacks the resources to execute a task or otherwise fails during query execution, the query fails and must be run again manually. The longer the runtime of a query, the more likely it is to be susceptible to such failures. 
From 30baa698114cd39b10864de3b0a3f8a2c131467c Mon Sep 17 00:00:00 2001 From: Lukas Krug Date: Mon, 18 Aug 2025 14:20:29 +0200 Subject: [PATCH 22/26] Update docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc Co-authored-by: Malte Sander --- .../trino/pages/usage-guide/fault-tolerant-execution.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index 2cad666b..c8a9436d 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -8,7 +8,7 @@ With fault-tolerant execution enabled, intermediate exchange data is spooled and By default, if a Trino node lacks the resources to execute a task or otherwise fails during query execution, the query fails and must be run again manually. The longer the runtime of a query, the more likely it is to be susceptible to such failures. -NOTE: Fault tolerance does not apply to broken queries or other user error. +NOTE: Fault tolerance does not apply to broken queries or other user errors. For example, Trino does not spend resources retrying a query that fails because its SQL cannot be parsed. Take a look at the link:https://trino.io/docs/current/admin/fault-tolerant-execution.html[Trino documentation for fault-tolerant execution {external-link-icon}^] to learn more. 
From 0d9e0661116dd63724b79908a3e64423963f201f Mon Sep 17 00:00:00 2001 From: Lukas Krug Date: Mon, 18 Aug 2025 14:20:52 +0200 Subject: [PATCH 23/26] Update docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc Co-authored-by: Malte Sander --- .../trino/pages/usage-guide/fault-tolerant-execution.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index c8a9436d..0d23154c 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -16,7 +16,7 @@ Take a look at the link:https://trino.io/docs/current/admin/fault-tolerant-execu == Configuration Fault-tolerant execution is not enabled by default. -To enable the feature, you need to configure it in your `TrinoCluster` resource by adding a `faultTolerantExecution` section to the cluster configuration. +It can be enabled in the `TrinoCluster` resource by adding a `faultTolerantExecution` section to the cluster configuration. The configuration uses a structured approach where you choose either `query` or `task` retry policy, each with their specific configuration options. 
=== Query retry policy From 684083c4f9d1a053645f7b39c4af8747ed37f8ca Mon Sep 17 00:00:00 2001 From: Lukas Krug Date: Mon, 18 Aug 2025 14:21:06 +0200 Subject: [PATCH 24/26] Update docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc Co-authored-by: Malte Sander --- .../trino/pages/usage-guide/fault-tolerant-execution.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc index 0d23154c..01100eb2 100644 --- a/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc +++ b/docs/modules/trino/pages/usage-guide/fault-tolerant-execution.adoc @@ -41,7 +41,7 @@ spec: A `task` retry policy instructs Trino to retry individual query tasks in the event of failure. You **must** configure an exchange manager to use the task retry policy. -This policy is recommended when executing large batch queries, as the cluster can more efficiently retry smaller tasks within the query rather than retry the whole query. +This policy is recommended when executing large batch queries, as the cluster can more efficiently retry smaller tasks within the query, rather than retry the whole query. IMPORTANT: A `task` retry policy is best suited for long-running queries, but this policy can result in higher latency for short-running queries executed in high volume. As a best practice, it is recommended to run a dedicated cluster with a `task` retry policy for large batch queries, separate from another cluster that handles short queries. 
From cef8eac6c1dff03f6e7d25d293e3391bd9258629 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Mon, 18 Aug 2025 14:38:01 +0200 Subject: [PATCH 25/26] fix: integration test fixes --- .../04-copy-scripts.yaml | 1 + .../fault-tolerant-execution/04_check-fte.py | 28 ++----------------- .../05-run-tests.yaml | 4 +-- 3 files changed, 6 insertions(+), 27 deletions(-) diff --git a/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml b/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml index c37cff38..37c0eeb9 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/04-copy-scripts.yaml @@ -2,4 +2,5 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: + - script: kubectl cp -n $NAMESPACE ../../../../templates/kuttl/commons/check-active-workers.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE 04_check-fte.py trino-test-helper-0:/tmp/ diff --git a/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py b/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py index 75ad8ec9..9daaf164 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py +++ b/tests/templates/kuttl/fault-tolerant-execution/04_check-fte.py @@ -27,39 +27,14 @@ def get_connection(coordinator): required=True, help="Trino Coordinator Host to connect to", ) - all_args.add_argument( - "-w", - "--workers", - required=True, - help="Expected amount of workers to be present", - ) args = vars(all_args.parse_args()) - expected_workers = args["workers"] conn = get_connection(args["coordinator"]) try: cursor = conn.cursor() - # Check that workers are active - cursor.execute( - "SELECT COUNT(*) as nodes FROM system.runtime.nodes WHERE coordinator=false AND state='active'" - ) - (active_workers,) = cursor.fetchone() - - if int(active_workers) != int(expected_workers): - print( - "Mismatch: [expected/active] workers [" - + str(expected_workers) - + "/" - + 
str(active_workers) - + "]" - ) - exit(-1) - - print(f"Active workers check passed: {active_workers}/{expected_workers}") - # Test that TPCH connector is working cursor.execute("SELECT COUNT(*) FROM tpch.tiny.nation") result = cursor.fetchone() @@ -90,6 +65,9 @@ def get_connection(coordinator): print("Complex query test passed") + cursor.close() + conn.close() + except Exception as e: print(f"Test failed with error: {e}") import traceback diff --git a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml index aa0e3601..d8f1e5c8 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml @@ -2,11 +2,11 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/04_check-fte.py -c trino-fte-coordinator -w 2 + - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p "" -c trino-fte-coordinator -w 2 + - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/04_check-fte.py -c trino-fte-coordinator timeout: 120 # Verify that the exchange bucket contains data - script: | - sleep 10 kubectl exec -n $NAMESPACE deployment/minio -- mc alias set local https://localhost:9000 minioAccessKey minioSecretKey --api S3v4 count=$(kubectl exec -n $NAMESPACE deployment/minio -- mc stat --insecure local/exchange-bucket | awk '/Objects count:/ {print $3}') if [ "$count" -gt 0 ]; then From f4a438a66bca7238aaf9e01c7a56369b00d24258 Mon Sep 17 00:00:00 2001 From: dervoeti Date: Mon, 18 Aug 2025 15:56:44 +0200 Subject: [PATCH 26/26] fix: integration test fixes --- .../kuttl/fault-tolerant-execution/05-assert.yaml | 12 ++++++++++++ .../kuttl/fault-tolerant-execution/05-run-tests.yaml | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git 
a/tests/templates/kuttl/fault-tolerant-execution/05-assert.yaml b/tests/templates/kuttl/fault-tolerant-execution/05-assert.yaml index 615e91b2..36e1c7e3 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/05-assert.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/05-assert.yaml @@ -2,6 +2,18 @@ apiVersion: kuttl.dev/v1beta1 kind: TestAssert timeout: 300 +commands: + - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p "" -c trino-fte-coordinator -w 2 + # Verify that the exchange bucket contains data + - script: | + kubectl exec -n $NAMESPACE deployment/minio -- mc alias set local https://localhost:9000 minioAccessKey minioSecretKey --api S3v4 + count=$(kubectl exec -n $NAMESPACE deployment/minio -- mc stat --insecure local/exchange-bucket | awk '/Objects count:/ {print $3}') + if [ "$count" -gt 0 ]; then + echo "Objects count is $count (> 0)" + else + echo "Objects count is $count (not > 0)" + exit 1 + fi --- apiVersion: apps/v1 kind: StatefulSet diff --git a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml index d8f1e5c8..68339612 100644 --- a/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml +++ b/tests/templates/kuttl/fault-tolerant-execution/05-run-tests.yaml @@ -2,17 +2,5 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p "" -c trino-fte-coordinator -w 2 - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/04_check-fte.py -c trino-fte-coordinator timeout: 120 - # Verify that the exchange bucket contains data - - script: | - kubectl exec -n $NAMESPACE deployment/minio -- mc alias set local https://localhost:9000 minioAccessKey minioSecretKey --api S3v4 - count=$(kubectl exec -n $NAMESPACE deployment/minio -- mc stat --insecure local/exchange-bucket | awk 
'/Objects count:/ {print $3}') - if [ "$count" -gt 0 ]; then - echo "Objects count is $count (> 0)" - else - echo "Objects count is $count (not > 0)" - exit 1 - fi - timeout: 30