From 2298e3a7c3887fc0a042bfa5d43b83c7e5ab0d8f Mon Sep 17 00:00:00 2001 From: xeniape Date: Thu, 16 Oct 2025 16:13:32 +0200 Subject: [PATCH 01/18] WIP: adds metrics and native-metrics service, adds metrics port to container, adds prometheus annotations --- rust/operator-binary/src/container.rs | 6 +- rust/operator-binary/src/crd/constants.rs | 6 + rust/operator-binary/src/crd/mod.rs | 120 +++++++++++-- rust/operator-binary/src/hdfs_controller.rs | 183 +++++++++++++++++++- 4 files changed, 287 insertions(+), 28 deletions(-) diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index 4110b48c..f23179db 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -68,7 +68,7 @@ use crate::{ NAMENODE_ROOT_DATA_DIR, READINESS_PROBE_FAILURE_THRESHOLD, READINESS_PROBE_INITIAL_DELAY_SECONDS, READINESS_PROBE_PERIOD_SECONDS, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, - SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR, + SERVICE_PORT_NAME_METRICS, SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR, }, storage::DataNodeStorageConfig, v1alpha1, @@ -488,7 +488,9 @@ impl ContainerConfig { )?) .add_volume_mounts(self.volume_mounts(hdfs, merged_config, labels)?) .context(AddVolumeMountSnafu)? - .add_container_ports(self.container_ports(hdfs)); + .add_container_ports(self.container_ports(hdfs)) + // TODO: This currently adds the metrics port also to the zkfc containers, not needed there? + .add_container_port(SERVICE_PORT_NAME_METRICS, hdfs.metrics_port(role).into()); if let Some(resources) = resources { cb.resources(resources); diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs index 876da8f3..37857b6d 100644 --- a/rust/operator-binary/src/crd/constants.rs +++ b/rust/operator-binary/src/crd/constants.rs @@ -24,17 +24,23 @@ pub const SERVICE_PORT_NAME_METRICS: &str = "metrics"; pub const DEFAULT_LISTENER_CLASS: &str = "cluster-internal"; pub const DEFAULT_NAME_NODE_METRICS_PORT: u16 = 8183; +pub const DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT: u16 = 9870; +pub const DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT: u16 = 9871; pub const DEFAULT_NAME_NODE_HTTP_PORT: u16 = 9870; pub const DEFAULT_NAME_NODE_HTTPS_PORT: u16 = 9871; pub const DEFAULT_NAME_NODE_RPC_PORT: u16 = 8020; pub const DEFAULT_DATA_NODE_METRICS_PORT: u16 = 8082; +pub const DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT: u16 = 9864; +pub const DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT: u16 = 9865; pub const DEFAULT_DATA_NODE_HTTP_PORT: u16 = 9864; pub const DEFAULT_DATA_NODE_HTTPS_PORT: u16 = 9865; pub const DEFAULT_DATA_NODE_DATA_PORT: u16 = 9866; pub const DEFAULT_DATA_NODE_IPC_PORT: u16 = 9867; pub const DEFAULT_JOURNAL_NODE_METRICS_PORT: u16 = 8081; +pub const DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT: u16 = 8480; +pub const DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT: u16 = 8481; pub const DEFAULT_JOURNAL_NODE_HTTP_PORT: u16 = 8480; pub const DEFAULT_JOURNAL_NODE_HTTPS_PORT: u16 = 8481; pub const DEFAULT_JOURNAL_NODE_RPC_PORT: u16 = 8485; diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 206509fb..b1a29c43 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -54,15 +54,18 @@ use crate::crd::{ APP_NAME, CORE_SITE_XML, DEFAULT_DATA_NODE_DATA_PORT, DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_DATA_NODE_HTTP_PORT, DEFAULT_DATA_NODE_HTTPS_PORT, DEFAULT_DATA_NODE_IPC_PORT, DEFAULT_DATA_NODE_METRICS_PORT, + 
DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT, DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT, DEFAULT_DFS_REPLICATION_FACTOR, DEFAULT_JOURNAL_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_JOURNAL_NODE_HTTP_PORT, DEFAULT_JOURNAL_NODE_HTTPS_PORT, - DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_JOURNAL_NODE_RPC_PORT, DEFAULT_LISTENER_CLASS, - DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_NAME_NODE_HTTP_PORT, - DEFAULT_NAME_NODE_HTTPS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, DEFAULT_NAME_NODE_RPC_PORT, - DFS_REPLICATION, HADOOP_POLICY_XML, HDFS_SITE_XML, JVM_SECURITY_PROPERTIES_FILE, - LISTENER_VOLUME_NAME, SERVICE_PORT_NAME_DATA, SERVICE_PORT_NAME_HTTP, - SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_METRICS, - SERVICE_PORT_NAME_RPC, SSL_CLIENT_XML, SSL_SERVER_XML, + DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT, + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT, DEFAULT_JOURNAL_NODE_RPC_PORT, + DEFAULT_LISTENER_CLASS, DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, + DEFAULT_NAME_NODE_HTTP_PORT, DEFAULT_NAME_NODE_HTTPS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT, DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT, + DEFAULT_NAME_NODE_RPC_PORT, DFS_REPLICATION, HADOOP_POLICY_XML, HDFS_SITE_XML, + JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_NAME, SERVICE_PORT_NAME_DATA, + SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, + SERVICE_PORT_NAME_METRICS, SERVICE_PORT_NAME_RPC, SSL_CLIENT_XML, SSL_SERVER_XML, }, security::{AuthenticationConfig, KerberosConfig}, storage::{ @@ -671,10 +674,6 @@ impl v1alpha1::HdfsCluster { pub fn ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { match role { HdfsNodeRole::Name => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_NAME_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_RPC), DEFAULT_NAME_NODE_RPC_PORT, @@ -692,10 +691,6 @@ impl v1alpha1::HdfsCluster { }, ], HdfsNodeRole::Data => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_DATA_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_DATA), DEFAULT_DATA_NODE_DATA_PORT, @@ -717,10 +712,6 @@ impl v1alpha1::HdfsCluster { }, ], HdfsNodeRole::Journal => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_JOURNAL_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_RPC), DEFAULT_JOURNAL_NODE_RPC_PORT, @@ -739,6 +730,97 @@ impl v1alpha1::HdfsCluster { ], } } + + /// Returns required metrics port name and metrics port number tuples depending on the role. + pub fn metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + match role { + HdfsNodeRole::Name => vec![( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_NAME_NODE_METRICS_PORT, + )], + HdfsNodeRole::Data => vec![( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_DATA_NODE_METRICS_PORT, + )], + HdfsNodeRole::Journal => vec![( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_JOURNAL_NODE_METRICS_PORT, + )], + } + } + + /// Returns required metrics port name and native metrics port number tuples depending on the role. 
+ pub fn native_metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + match role { + HdfsNodeRole::Name => vec![if self.has_https_enabled() { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT, + ) + } else { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT, + ) + }], + HdfsNodeRole::Data => vec![if self.has_https_enabled() { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT, + ) + } else { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT, + ) + }], + HdfsNodeRole::Journal => vec![if self.has_https_enabled() { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT, + ) + } else { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT, + ) + }], + } + } + + pub fn metrics_port(&self, role: &HdfsNodeRole) -> u16 { + match role { + HdfsNodeRole::Name => DEFAULT_NAME_NODE_METRICS_PORT, + HdfsNodeRole::Data => DEFAULT_DATA_NODE_METRICS_PORT, + HdfsNodeRole::Journal => DEFAULT_JOURNAL_NODE_METRICS_PORT, + } + } + + pub fn native_metrics_port(&self, role: &HdfsNodeRole) -> u16 { + match role { + HdfsNodeRole::Name => { + if self.has_https_enabled() { + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT + } + } + HdfsNodeRole::Data => { + if self.has_https_enabled() { + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT + } + } + HdfsNodeRole::Journal => { + if self.has_https_enabled() { + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT + } + } + } + } } #[derive(Clone, Debug, Deserialize, Eq, Hash, JsonSchema, PartialEq, Serialize)] diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs index 0eea1e6d..a1d2267e 100644 --- a/rust/operator-binary/src/hdfs_controller.rs +++ b/rust/operator-binary/src/hdfs_controller.rs @@ -37,7 +37,7 @@ use stackable_operator::{ core::{DeserializeGuard, error_boundary}, runtime::{controller::Action, events::Recorder, reflector::ObjectRef}, }, - kvp::{Label, LabelError, Labels}, + kvp::{Annotations, Label, LabelError, Labels}, logging::controller::ReconcilerError, product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config}, role_utils::{GenericRoleConfig, RoleGroupRef}, @@ -413,6 +413,15 @@ pub async fn reconcile_hdfs( let rg_service = rolegroup_service(hdfs, metadata, &role, &rolegroup_ref)?; + let rg_metrics_service = + rolegroup_metrics_service(hdfs, &role, &rolegroup_ref, &resolved_product_image)?; + let rg_native_metrics_service = rolegroup_native_metrics_service( + hdfs, + &role, + &rolegroup_ref, + &resolved_product_image, + )?; + let rg_configmap = rolegroup_config_map( hdfs, &client.kubernetes_cluster_info, @@ -439,12 +448,27 @@ pub async fn reconcile_hdfs( )?; let rg_service_name = rg_service.name_any(); + let rg_metrics_service_name = rg_metrics_service.name_any(); + let rg_native_metrics_service_name = rg_native_metrics_service.name_any(); + cluster_resources .add(client, rg_service) .await .with_context(|_| ApplyRoleGroupServiceSnafu { name: rg_service_name, })?; + cluster_resources + .add(client, rg_metrics_service) + .await + .with_context(|_| ApplyRoleGroupServiceSnafu { + name: rg_metrics_service_name, + })?; + cluster_resources + .add(client, 
rg_native_metrics_service) + .await + .with_context(|_| ApplyRoleGroupServiceSnafu { + name: rg_native_metrics_service_name, + })?; let rg_configmap_name = rg_configmap.name_any(); cluster_resources .add(client, rg_configmap.clone()) @@ -568,11 +592,6 @@ fn rolegroup_service( ) -> HdfsOperatorResult { tracing::info!("Setting up Service for {:?}", rolegroup_ref); - let prometheus_label = - Label::try_from(("prometheus.io/scrape", "true")).context(BuildPrometheusLabelSnafu)?; - let mut metadata_with_prometheus_label = metadata.clone(); - metadata_with_prometheus_label.with_label(prometheus_label); - let service_spec = ServiceSpec { // Internal communication does not need to be exposed type_: Some("ClusterIP".to_string()), @@ -598,7 +617,157 @@ fn rolegroup_service( }; Ok(Service { - metadata: metadata_with_prometheus_label.build(), + metadata: metadata.build(), + spec: Some(service_spec), + status: None, + }) +} + +fn rolegroup_metrics_service( + hdfs: &v1alpha1::HdfsCluster, + role: &HdfsNodeRole, + rolegroup_ref: &RoleGroupRef, + resolved_product_image: &ResolvedProductImage, +) -> HdfsOperatorResult { + tracing::info!("Setting up metrics Service for {:?}", rolegroup_ref); + + let service_spec = ServiceSpec { + // Internal communication does not need to be exposed + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some( + hdfs.metrics_ports(role) + .into_iter() + .map(|(name, value)| ServicePort { + name: Some(name), + port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }) + .collect(), + ), + selector: Some( + hdfs.rolegroup_selector_labels(rolegroup_ref) + .context(RoleGroupSelectorLabelsSnafu)? + .into(), + ), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata: ObjectMetaBuilder::new() + .name_and_namespace(hdfs) + .name(rolegroup_ref.rolegroup_metrics_service_name()) + .ownerreference_from_resource(hdfs, None, Some(true)) + .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { + obj_ref: ObjectRef::from_obj(hdfs), + })? + .with_recommended_labels(build_recommended_labels( + hdfs, + RESOURCE_MANAGER_HDFS_CONTROLLER, + &resolved_product_image.app_version_label_value, + &rolegroup_ref.role, + &rolegroup_ref.role_group, + )) + .context(ObjectMetaSnafu)? 
+ .with_label( + Label::try_from(("prometheus.io/scrape", "true")) + .context(BuildPrometheusLabelSnafu)?, + ) + .with_annotations( + Annotations::try_from([ + ("prometheus.io/path".to_owned(), "/metrics".to_owned()), + ( + "prometheus.io/port".to_owned(), + hdfs.metrics_port(role).to_string(), + ), + ("prometheus.io/scheme".to_owned(), "http".to_owned()), + ("prometheus.io/scrape".to_owned(), "true".to_owned()), + ]) + .expect("should be valid annotations"), + ) + .build(), + spec: Some(service_spec), + status: None, + }) +} + +fn rolegroup_native_metrics_service( + hdfs: &v1alpha1::HdfsCluster, + role: &HdfsNodeRole, + rolegroup_ref: &RoleGroupRef, + resolved_product_image: &ResolvedProductImage, +) -> HdfsOperatorResult { + tracing::info!("Setting up native metrics Service for {:?}", rolegroup_ref); + + let service_spec = ServiceSpec { + // Internal communication does not need to be exposed + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some( + hdfs.native_metrics_ports(role) + .into_iter() + .map(|(name, value)| ServicePort { + name: Some(name), + port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }) + .collect(), + ), + selector: Some( + hdfs.rolegroup_selector_labels(rolegroup_ref) + .context(RoleGroupSelectorLabelsSnafu)? + .into(), + ), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata: ObjectMetaBuilder::new() + .name_and_namespace(hdfs) + .name(format!( + "{name}-native-metrics", + name = rolegroup_ref.object_name() + )) + .ownerreference_from_resource(hdfs, None, Some(true)) + .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { + obj_ref: ObjectRef::from_obj(hdfs), + })? + .with_recommended_labels(build_recommended_labels( + hdfs, + RESOURCE_MANAGER_HDFS_CONTROLLER, + &resolved_product_image.app_version_label_value, + &rolegroup_ref.role, + &rolegroup_ref.role_group, + )) + .context(ObjectMetaSnafu)? 
+            .with_label(
+                Label::try_from(("prometheus.io/scrape", "true"))
+                    .context(BuildPrometheusLabelSnafu)?,
+            )
+            .with_annotations(
+                Annotations::try_from([
+                    ("prometheus.io/path".to_owned(), "/prom".to_owned()),
+                    (
+                        "prometheus.io/port".to_owned(),
+                        hdfs.native_metrics_port(role).to_string(),
+                    ),
+                    (
+                        "prometheus.io/scheme".to_owned(),
+                        if hdfs.has_https_enabled() {
+                            "https".to_owned()
+                        } else {
+                            "http".to_owned()
+                        },
+                    ),
+                    ("prometheus.io/scrape".to_owned(), "true".to_owned()),
+                ])
+                .expect("should be valid annotations"),
+            )
+            .build(),
         spec: Some(service_spec),
         status: None,
     })

From d572fa3fe9ba16a2be6308cba572723862ac0869 Mon Sep 17 00:00:00 2001
From: Malte Sander
Date: Tue, 21 Oct 2025 14:58:44 +0200
Subject: [PATCH 02/18] move services to own module

---
 rust/operator-binary/src/hdfs_controller.rs | 236 ++-----------------
 rust/operator-binary/src/main.rs            |   1 +
 rust/operator-binary/src/service.rs         | 244 ++++++++++++++++++++
 3 files changed, 270 insertions(+), 211 deletions(-)
 create mode 100644 rust/operator-binary/src/service.rs

diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs
index a1d2267e..ce178b0f 100644
--- a/rust/operator-binary/src/hdfs_controller.rs
+++ b/rust/operator-binary/src/hdfs_controller.rs
@@ -27,7 +27,7 @@ use stackable_operator::{
         DeepMerge,
         api::{
             apps::v1::{StatefulSet, StatefulSetSpec},
-            core::v1::{ConfigMap, Service, ServiceAccount, ServicePort, ServiceSpec},
+            core::v1::{ConfigMap, ServiceAccount},
         },
         apimachinery::pkg::apis::meta::v1::LabelSelector,
     },
@@ -37,7 +37,7 @@ use stackable_operator::{
         core::{DeserializeGuard, error_boundary},
         runtime::{controller::Action, events::Recorder, reflector::ObjectRef},
     },
-    kvp::{Annotations, Label, LabelError, Labels},
+    kvp::{LabelError, Labels},
     logging::controller::ReconcilerError,
     product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config},
    role_utils::{GenericRoleConfig, RoleGroupRef},
@@ -69,6 +69,10 @@ use crate::{
     },
     product_logging::extend_role_group_config_map,
     security::{self, kerberos, opa::HdfsOpaConfig},
+    service::{
+        self, rolegroup_headless_service, rolegroup_metrics_service,
+        rolegroup_native_metrics_service,
+    },
 };
 
 pub const RESOURCE_MANAGER_HDFS_CONTROLLER: &str = "hdfs-operator-hdfs-controller";
@@ -218,15 +222,9 @@ pub enum Error {
     #[snafu(display("failed to build roleGroup selector labels"))]
     RoleGroupSelectorLabels { source: crate::crd::Error },
 
-    #[snafu(display("failed to build prometheus label"))]
-    BuildPrometheusLabel { source: LabelError },
-
     #[snafu(display("failed to build cluster resources label"))]
     BuildClusterResourcesLabel { source: LabelError },
 
-    #[snafu(display("failed to build role-group selector label"))]
-    BuildRoleGroupSelectorLabel { source: LabelError },
-
     #[snafu(display("failed to build role-group volume claim templates from config"))]
     BuildRoleGroupVolumeClaimTemplates { source: container::Error },
 
@@ -250,6 +248,9 @@ pub enum Error {
     ResolveProductImage {
         source: product_image_selection::Error,
     },
+
+    #[snafu(display("failed to build service"))]
+    BuildService { source: service::Error },
 }
 
 impl ReconcilerError for Error {
@@ -392,6 +393,20 @@ pub async fn reconcile_hdfs(
 
             let rolegroup_ref = hdfs.rolegroup_ref(role_name, rolegroup_name);
 
+            let rg_service =
+                rolegroup_headless_service(hdfs, &role, &rolegroup_ref, &resolved_product_image)
+                    .context(BuildServiceSnafu)?;
+            let rg_metrics_service =
+                rolegroup_metrics_service(hdfs, &role, &rolegroup_ref,
&resolved_product_image) + .context(BuildServiceSnafu)?; + let rg_native_metrics_service = rolegroup_native_metrics_service( + hdfs, + &role, + &rolegroup_ref, + &resolved_product_image, + ) + .context(BuildServiceSnafu)?; + // We need to split the creation and the usage of the "metadata" variable in two statements. // to avoid the compiler error "E0716 (temporary value dropped while borrowed)". let mut metadata = ObjectMetaBuilder::new(); @@ -411,17 +426,6 @@ pub async fn reconcile_hdfs( )) .context(ObjectMetaSnafu)?; - let rg_service = rolegroup_service(hdfs, metadata, &role, &rolegroup_ref)?; - - let rg_metrics_service = - rolegroup_metrics_service(hdfs, &role, &rolegroup_ref, &resolved_product_image)?; - let rg_native_metrics_service = rolegroup_native_metrics_service( - hdfs, - &role, - &rolegroup_ref, - &resolved_product_image, - )?; - let rg_configmap = rolegroup_config_map( hdfs, &client.kubernetes_cluster_info, @@ -584,195 +588,6 @@ pub async fn reconcile_hdfs( Ok(Action::await_change()) } -fn rolegroup_service( - hdfs: &v1alpha1::HdfsCluster, - metadata: &ObjectMetaBuilder, - role: &HdfsNodeRole, - rolegroup_ref: &RoleGroupRef, -) -> HdfsOperatorResult { - tracing::info!("Setting up Service for {:?}", rolegroup_ref); - - let service_spec = ServiceSpec { - // Internal communication does not need to be exposed - type_: Some("ClusterIP".to_string()), - cluster_ip: Some("None".to_string()), - ports: Some( - hdfs.ports(role) - .into_iter() - .map(|(name, value)| ServicePort { - name: Some(name), - port: i32::from(value), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }) - .collect(), - ), - selector: Some( - hdfs.rolegroup_selector_labels(rolegroup_ref) - .context(RoleGroupSelectorLabelsSnafu)? - .into(), - ), - publish_not_ready_addresses: Some(true), - ..ServiceSpec::default() - }; - - Ok(Service { - metadata: metadata.build(), - spec: Some(service_spec), - status: None, - }) -} - -fn rolegroup_metrics_service( - hdfs: &v1alpha1::HdfsCluster, - role: &HdfsNodeRole, - rolegroup_ref: &RoleGroupRef, - resolved_product_image: &ResolvedProductImage, -) -> HdfsOperatorResult { - tracing::info!("Setting up metrics Service for {:?}", rolegroup_ref); - - let service_spec = ServiceSpec { - // Internal communication does not need to be exposed - type_: Some("ClusterIP".to_string()), - cluster_ip: Some("None".to_string()), - ports: Some( - hdfs.metrics_ports(role) - .into_iter() - .map(|(name, value)| ServicePort { - name: Some(name), - port: i32::from(value), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }) - .collect(), - ), - selector: Some( - hdfs.rolegroup_selector_labels(rolegroup_ref) - .context(RoleGroupSelectorLabelsSnafu)? - .into(), - ), - publish_not_ready_addresses: Some(true), - ..ServiceSpec::default() - }; - - Ok(Service { - metadata: ObjectMetaBuilder::new() - .name_and_namespace(hdfs) - .name(rolegroup_ref.rolegroup_metrics_service_name()) - .ownerreference_from_resource(hdfs, None, Some(true)) - .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { - obj_ref: ObjectRef::from_obj(hdfs), - })? - .with_recommended_labels(build_recommended_labels( - hdfs, - RESOURCE_MANAGER_HDFS_CONTROLLER, - &resolved_product_image.app_version_label_value, - &rolegroup_ref.role, - &rolegroup_ref.role_group, - )) - .context(ObjectMetaSnafu)? 
- .with_label( - Label::try_from(("prometheus.io/scrape", "true")) - .context(BuildPrometheusLabelSnafu)?, - ) - .with_annotations( - Annotations::try_from([ - ("prometheus.io/path".to_owned(), "/metrics".to_owned()), - ( - "prometheus.io/port".to_owned(), - hdfs.metrics_port(role).to_string(), - ), - ("prometheus.io/scheme".to_owned(), "http".to_owned()), - ("prometheus.io/scrape".to_owned(), "true".to_owned()), - ]) - .expect("should be valid annotations"), - ) - .build(), - spec: Some(service_spec), - status: None, - }) -} - -fn rolegroup_native_metrics_service( - hdfs: &v1alpha1::HdfsCluster, - role: &HdfsNodeRole, - rolegroup_ref: &RoleGroupRef, - resolved_product_image: &ResolvedProductImage, -) -> HdfsOperatorResult { - tracing::info!("Setting up native metrics Service for {:?}", rolegroup_ref); - - let service_spec = ServiceSpec { - // Internal communication does not need to be exposed - type_: Some("ClusterIP".to_string()), - cluster_ip: Some("None".to_string()), - ports: Some( - hdfs.native_metrics_ports(role) - .into_iter() - .map(|(name, value)| ServicePort { - name: Some(name), - port: i32::from(value), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }) - .collect(), - ), - selector: Some( - hdfs.rolegroup_selector_labels(rolegroup_ref) - .context(RoleGroupSelectorLabelsSnafu)? - .into(), - ), - publish_not_ready_addresses: Some(true), - ..ServiceSpec::default() - }; - - Ok(Service { - metadata: ObjectMetaBuilder::new() - .name_and_namespace(hdfs) - .name(format!( - "{name}-native-metrics", - name = rolegroup_ref.object_name() - )) - .ownerreference_from_resource(hdfs, None, Some(true)) - .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { - obj_ref: ObjectRef::from_obj(hdfs), - })? - .with_recommended_labels(build_recommended_labels( - hdfs, - RESOURCE_MANAGER_HDFS_CONTROLLER, - &resolved_product_image.app_version_label_value, - &rolegroup_ref.role, - &rolegroup_ref.role_group, - )) - .context(ObjectMetaSnafu)? - .with_label( - Label::try_from(("prometheus.io/scrape", "true")) - .context(BuildPrometheusLabelSnafu)?, - ) - .with_annotations( - Annotations::try_from([ - ("prometheus.io/path".to_owned(), "/prom".to_owned()), - ( - "prometheus.io/port".to_owned(), - hdfs.native_metrics_port(role).to_string(), - ), - ( - "prometheus.io/scheme".to_owned(), - if hdfs.has_https_enabled() { - "https".to_owned() - } else { - "http".to_owned() - }, - ), - ("prometheus.io/scrape".to_owned(), "true".to_owned()), - ]) - .expect("should be valid annotations"), - ) - .build(), - spec: Some(service_spec), - status: None, - }) -} - #[allow(clippy::too_many_arguments)] fn rolegroup_config_map( hdfs: &v1alpha1::HdfsCluster, @@ -1026,7 +841,6 @@ fn rolegroup_statefulset( ) -> HdfsOperatorResult { tracing::info!("Setting up StatefulSet for {:?}", rolegroup_ref); - let object_name = rolegroup_ref.object_name(); // PodBuilder for StatefulSet Pod template. 
let mut pb = PodBuilder::new(); @@ -1061,7 +875,7 @@ fn rolegroup_statefulset( merged_config, env_overrides, &hdfs.spec.cluster_config.zookeeper_config_map_name, - &object_name, + &rolegroup_ref.object_name(), namenode_podrefs, &rolegroup_selector_labels, ) @@ -1091,7 +905,7 @@ fn rolegroup_statefulset( match_labels: Some(rolegroup_selector_labels.into()), ..LabelSelector::default() }, - service_name: Some(object_name), + service_name: Some(rolegroup_ref.rolegroup_headless_service_name()), template: pod_template, volume_claim_templates: Some(pvcs), diff --git a/rust/operator-binary/src/main.rs b/rust/operator-binary/src/main.rs index 4ea60b2b..3701ac3b 100644 --- a/rust/operator-binary/src/main.rs +++ b/rust/operator-binary/src/main.rs @@ -47,6 +47,7 @@ mod hdfs_controller; mod operations; mod product_logging; mod security; +mod service; mod built_info { include!(concat!(env!("OUT_DIR"), "/built.rs")); diff --git a/rust/operator-binary/src/service.rs b/rust/operator-binary/src/service.rs new file mode 100644 index 00000000..f511df63 --- /dev/null +++ b/rust/operator-binary/src/service.rs @@ -0,0 +1,244 @@ +use snafu::{ResultExt, Snafu}; +use stackable_operator::{ + builder::meta::ObjectMetaBuilder, + commons::product_image_selection::ResolvedProductImage, + k8s_openapi::api::core::v1::{Service, ServicePort, ServiceSpec}, + kube::runtime::reflector::ObjectRef, + kvp::{Annotations, Label, LabelError}, + role_utils::RoleGroupRef, +}; + +use crate::{ + build_recommended_labels, + crd::{HdfsNodeRole, v1alpha1}, + hdfs_controller::RESOURCE_MANAGER_HDFS_CONTROLLER, +}; + +#[derive(Snafu, Debug)] +pub enum Error { + #[snafu(display("failed to build prometheus label"))] + BuildPrometheusLabel { source: LabelError }, + + #[snafu(display("failed to build role-group selector label"))] + BuildRoleGroupSelectorLabel { source: LabelError }, + + #[snafu(display("failed to build object meta data"))] + ObjectMeta { + source: stackable_operator::builder::meta::Error, + }, + + #[snafu(display("no metadata for {obj_ref:?}"))] + ObjectMissingMetadataForOwnerRef { + source: stackable_operator::builder::meta::Error, + obj_ref: ObjectRef, + }, + + #[snafu(display("failed to build roleGroup selector labels"))] + RoleGroupSelectorLabels { source: crate::crd::Error }, +} + +pub(crate) fn rolegroup_headless_service( + hdfs: &v1alpha1::HdfsCluster, + role: &HdfsNodeRole, + rolegroup_ref: &RoleGroupRef, + resolved_product_image: &ResolvedProductImage, +) -> Result { + tracing::info!("Setting up Service for {:?}", rolegroup_ref); + + let mut metadata_builder = ObjectMetaBuilder::new(); + metadata_builder + .name_and_namespace(hdfs) + .name(rolegroup_ref.rolegroup_headless_service_name()) + .ownerreference_from_resource(hdfs, None, Some(true)) + .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { + obj_ref: ObjectRef::from_obj(hdfs), + })? 
+ .with_recommended_labels(build_recommended_labels( + hdfs, + RESOURCE_MANAGER_HDFS_CONTROLLER, + &resolved_product_image.app_version_label_value, + &rolegroup_ref.role, + &rolegroup_ref.role_group, + )) + .context(ObjectMetaSnafu)?; + + let service_spec = ServiceSpec { + // Internal communication does not need to be exposed + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some( + hdfs.ports(role) + .into_iter() + .map(|(name, value)| ServicePort { + name: Some(name), + port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }) + .collect(), + ), + selector: Some( + hdfs.rolegroup_selector_labels(rolegroup_ref) + .context(RoleGroupSelectorLabelsSnafu)? + .into(), + ), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata: metadata_builder.build(), + spec: Some(service_spec), + status: None, + }) +} + +pub(crate) fn rolegroup_metrics_service( + hdfs: &v1alpha1::HdfsCluster, + role: &HdfsNodeRole, + rolegroup_ref: &RoleGroupRef, + resolved_product_image: &ResolvedProductImage, +) -> Result { + tracing::info!("Setting up metrics Service for {:?}", rolegroup_ref); + + let service_spec = ServiceSpec { + // Internal communication does not need to be exposed + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some( + hdfs.metrics_ports(role) + .into_iter() + .map(|(name, value)| ServicePort { + name: Some(name), + port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }) + .collect(), + ), + selector: Some( + hdfs.rolegroup_selector_labels(rolegroup_ref) + .context(RoleGroupSelectorLabelsSnafu)? + .into(), + ), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata: ObjectMetaBuilder::new() + .name_and_namespace(hdfs) + .name(rolegroup_ref.rolegroup_metrics_service_name()) + .ownerreference_from_resource(hdfs, None, Some(true)) + .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { + obj_ref: ObjectRef::from_obj(hdfs), + })? + .with_recommended_labels(build_recommended_labels( + hdfs, + RESOURCE_MANAGER_HDFS_CONTROLLER, + &resolved_product_image.app_version_label_value, + &rolegroup_ref.role, + &rolegroup_ref.role_group, + )) + .context(ObjectMetaSnafu)? 
+ .with_label( + Label::try_from(("prometheus.io/scrape", "true")) + .context(BuildPrometheusLabelSnafu)?, + ) + .with_annotations( + Annotations::try_from([ + ("prometheus.io/path".to_owned(), "/metrics".to_owned()), + ( + "prometheus.io/port".to_owned(), + hdfs.metrics_port(role).to_string(), + ), + ("prometheus.io/scheme".to_owned(), "http".to_owned()), + ("prometheus.io/scrape".to_owned(), "true".to_owned()), + ]) + .expect("should be valid annotations"), + ) + .build(), + spec: Some(service_spec), + status: None, + }) +} + +pub(crate) fn rolegroup_native_metrics_service( + hdfs: &v1alpha1::HdfsCluster, + role: &HdfsNodeRole, + rolegroup_ref: &RoleGroupRef, + resolved_product_image: &ResolvedProductImage, +) -> Result { + tracing::info!("Setting up native metrics Service for {:?}", rolegroup_ref); + + let service_spec = ServiceSpec { + // Internal communication does not need to be exposed + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some( + hdfs.native_metrics_ports(role) + .into_iter() + .map(|(name, value)| ServicePort { + name: Some(name), + port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }) + .collect(), + ), + selector: Some( + hdfs.rolegroup_selector_labels(rolegroup_ref) + .context(RoleGroupSelectorLabelsSnafu)? + .into(), + ), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata: ObjectMetaBuilder::new() + .name_and_namespace(hdfs) + .name(format!( + "{name}-native-metrics", + name = rolegroup_ref.object_name() + )) + .ownerreference_from_resource(hdfs, None, Some(true)) + .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { + obj_ref: ObjectRef::from_obj(hdfs), + })? + .with_recommended_labels(build_recommended_labels( + hdfs, + RESOURCE_MANAGER_HDFS_CONTROLLER, + &resolved_product_image.app_version_label_value, + &rolegroup_ref.role, + &rolegroup_ref.role_group, + )) + .context(ObjectMetaSnafu)? + .with_label( + Label::try_from(("prometheus.io/scrape", "true")) + .context(BuildPrometheusLabelSnafu)?, + ) + .with_annotations( + Annotations::try_from([ + ("prometheus.io/path".to_owned(), "/prom".to_owned()), + ( + "prometheus.io/port".to_owned(), + hdfs.native_metrics_port(role).to_string(), + ), + ( + "prometheus.io/scheme".to_owned(), + if hdfs.has_https_enabled() { + "https".to_owned() + } else { + "http".to_owned() + }, + ), + ("prometheus.io/scrape".to_owned(), "true".to_owned()), + ]) + .expect("should be valid annotations"), + ) + .build(), + spec: Some(service_spec), + status: None, + }) +} From 7cc2e2ff222976be80b17a869b7db619e543c8e0 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 22 Oct 2025 15:17:53 +0200 Subject: [PATCH 03/18] remove second metrics service, consolidate ports --- rust/operator-binary/src/container.rs | 26 +++---- rust/operator-binary/src/crd/constants.rs | 1 + rust/operator-binary/src/crd/mod.rs | 45 +++++++++--- rust/operator-binary/src/hdfs_controller.rs | 19 +---- rust/operator-binary/src/service.rs | 77 +-------------------- 5 files changed, 51 insertions(+), 117 deletions(-) diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index f23179db..f13ad8a1 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -488,9 +488,7 @@ impl ContainerConfig { )?) .add_volume_mounts(self.volume_mounts(hdfs, merged_config, labels)?) .context(AddVolumeMountSnafu)? 
-        .add_container_ports(self.container_ports(hdfs))
-        // TODO: This currently adds the metrics port also to the zkfc containers, not needed there?
-        .add_container_port(SERVICE_PORT_NAME_METRICS, hdfs.metrics_port(role).into());
+        .add_container_ports(self.container_ports(hdfs));
 
         if let Some(resources) = resources {
             cb.resources(resources);
@@ -1251,16 +1249,18 @@ wait_for_termination $!
     /// Container ports for the main containers namenode, datanode and journalnode.
     fn container_ports(&self, hdfs: &v1alpha1::HdfsCluster) -> Vec<ContainerPort> {
         match self {
-            ContainerConfig::Hdfs { role, .. } => hdfs
-                .ports(role)
-                .into_iter()
-                .map(|(name, value)| ContainerPort {
-                    name: Some(name),
-                    container_port: i32::from(value),
-                    protocol: Some("TCP".to_string()),
-                    ..ContainerPort::default()
-                })
-                .collect(),
+            ContainerConfig::Hdfs { role, .. } => {
+                // data ports
+                hdfs.hdfs_main_container_ports(role)
+                    .into_iter()
+                    .map(|(name, value)| ContainerPort {
+                        name: Some(name),
+                        container_port: i32::from(value),
+                        protocol: Some("TCP".to_string()),
+                        ..ContainerPort::default()
+                    })
+                    .collect()
+            }
             _ => {
                 vec![]
             }
diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs
index 37857b6d..44266d0c 100644
--- a/rust/operator-binary/src/crd/constants.rs
+++ b/rust/operator-binary/src/crd/constants.rs
@@ -20,6 +20,7 @@ pub const SERVICE_PORT_NAME_HTTP: &str = "http";
 pub const SERVICE_PORT_NAME_HTTPS: &str = "https";
 pub const SERVICE_PORT_NAME_DATA: &str = "data";
 pub const SERVICE_PORT_NAME_METRICS: &str = "metrics";
+pub const SERVICE_PORT_NAME_JMX_METRICS: &str = "jmx-metrics";
 
 pub const DEFAULT_LISTENER_CLASS: &str = "cluster-internal";
 
diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs
index b1a29c43..bd3674bf 100644
--- a/rust/operator-binary/src/crd/mod.rs
+++ b/rust/operator-binary/src/crd/mod.rs
@@ -65,7 +65,8 @@ use crate::crd::{
     DEFAULT_NAME_NODE_RPC_PORT, DFS_REPLICATION, HADOOP_POLICY_XML, HDFS_SITE_XML,
     JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_NAME, SERVICE_PORT_NAME_DATA,
     SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC,
-    SERVICE_PORT_NAME_METRICS, SERVICE_PORT_NAME_RPC, SSL_CLIENT_XML, SSL_SERVER_XML,
+    SERVICE_PORT_NAME_JMX_METRICS, SERVICE_PORT_NAME_METRICS, SERVICE_PORT_NAME_RPC,
+    SSL_CLIENT_XML, SSL_SERVER_XML,
 },
 security::{AuthenticationConfig, KerberosConfig},
 storage::{
@@ -390,10 +391,10 @@ impl v1alpha1::HdfsCluster {
                 let ns = ns.clone();
                 (0..*replicas).map(move |i| HdfsPodRef {
                     namespace: ns.clone(),
-                    role_group_service_name: rolegroup_ref.object_name(),
+                    role_group_service_name: rolegroup_ref.rolegroup_headless_service_name(),
                     pod_name: format!("{}-{}", rolegroup_ref.object_name(), i),
                     ports: self
-                        .ports(role)
+                        .data_ports(role)
                         .iter()
                         .map(|(n, p)| (n.clone(), *p))
                         .collect(),
@@ -671,7 +672,7 @@ impl v1alpha1::HdfsCluster {
     }
 
     /// Returns required port name and port number tuples depending on the role.
-    pub fn ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> {
+    pub fn data_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> {
         match role {
             HdfsNodeRole::Name => vec![
                 (
@@ -731,26 +732,26 @@ impl v1alpha1::HdfsCluster {
         }
     }
 
-    /// Returns required metrics port name and metrics port number tuples depending on the role.
-    pub fn metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> {
+    /// Returns the deprecated JMX metrics port name and port number tuples depending on the role.
+ pub fn jmx_metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { match role { HdfsNodeRole::Name => vec![( - String::from(SERVICE_PORT_NAME_METRICS), + String::from(SERVICE_PORT_NAME_JMX_METRICS), DEFAULT_NAME_NODE_METRICS_PORT, )], HdfsNodeRole::Data => vec![( - String::from(SERVICE_PORT_NAME_METRICS), + String::from(SERVICE_PORT_NAME_JMX_METRICS), DEFAULT_DATA_NODE_METRICS_PORT, )], HdfsNodeRole::Journal => vec![( - String::from(SERVICE_PORT_NAME_METRICS), + String::from(SERVICE_PORT_NAME_JMX_METRICS), DEFAULT_JOURNAL_NODE_METRICS_PORT, )], } } - /// Returns required metrics port name and native metrics port number tuples depending on the role. - pub fn native_metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + /// Returns required metrics port name and metrics port number tuples depending on the role and security settings. + pub fn metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { match role { HdfsNodeRole::Name => vec![if self.has_https_enabled() { ( @@ -821,6 +822,28 @@ impl v1alpha1::HdfsCluster { } } } + + pub fn metrics_service_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut metrics_service_ports = vec![]; + // "native" ports + metrics_service_ports.extend(self.metrics_ports(role)); + metrics_service_ports.extend(self.jmx_metrics_ports(role)); + metrics_service_ports + } + + pub fn headless_service_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut headless_service_ports = vec![]; + headless_service_ports.extend(self.data_ports(role)); + headless_service_ports + } + + pub fn hdfs_main_container_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut main_container_ports = vec![]; + main_container_ports.extend(self.data_ports(role)); + // TODO: This will be exposed in the listener if added to container ports? + // main_container_ports.extend(self.jmx_metrics_ports(role)); + main_container_ports + } } #[derive(Clone, Debug, Deserialize, Eq, Hash, JsonSchema, PartialEq, Serialize)] diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs index ce178b0f..f1ab314a 100644 --- a/rust/operator-binary/src/hdfs_controller.rs +++ b/rust/operator-binary/src/hdfs_controller.rs @@ -69,10 +69,7 @@ use crate::{ }, product_logging::extend_role_group_config_map, security::{self, kerberos, opa::HdfsOpaConfig}, - service::{ - self, rolegroup_headless_service, rolegroup_metrics_service, - rolegroup_native_metrics_service, - }, + service::{self, rolegroup_headless_service, rolegroup_metrics_service}, }; pub const RESOURCE_MANAGER_HDFS_CONTROLLER: &str = "hdfs-operator-hdfs-controller"; @@ -399,13 +396,6 @@ pub async fn reconcile_hdfs( let rg_metrics_service = rolegroup_metrics_service(hdfs, &role, &rolegroup_ref, &resolved_product_image) .context(BuildServiceSnafu)?; - let rg_native_metrics_service = rolegroup_native_metrics_service( - hdfs, - &role, - &rolegroup_ref, - &resolved_product_image, - ) - .context(BuildServiceSnafu)?; // We need to split the creation and the usage of the "metadata" variable in two statements. // to avoid the compiler error "E0716 (temporary value dropped while borrowed)". 
@@ -453,7 +443,6 @@ pub async fn reconcile_hdfs( let rg_service_name = rg_service.name_any(); let rg_metrics_service_name = rg_metrics_service.name_any(); - let rg_native_metrics_service_name = rg_native_metrics_service.name_any(); cluster_resources .add(client, rg_service) @@ -467,12 +456,6 @@ pub async fn reconcile_hdfs( .with_context(|_| ApplyRoleGroupServiceSnafu { name: rg_metrics_service_name, })?; - cluster_resources - .add(client, rg_native_metrics_service) - .await - .with_context(|_| ApplyRoleGroupServiceSnafu { - name: rg_native_metrics_service_name, - })?; let rg_configmap_name = rg_configmap.name_any(); cluster_resources .add(client, rg_configmap.clone()) diff --git a/rust/operator-binary/src/service.rs b/rust/operator-binary/src/service.rs index f511df63..bac5cdca 100644 --- a/rust/operator-binary/src/service.rs +++ b/rust/operator-binary/src/service.rs @@ -67,7 +67,7 @@ pub(crate) fn rolegroup_headless_service( type_: Some("ClusterIP".to_string()), cluster_ip: Some("None".to_string()), ports: Some( - hdfs.ports(role) + hdfs.headless_service_ports(role) .into_iter() .map(|(name, value)| ServicePort { name: Some(name), @@ -106,7 +106,7 @@ pub(crate) fn rolegroup_metrics_service( type_: Some("ClusterIP".to_string()), cluster_ip: Some("None".to_string()), ports: Some( - hdfs.metrics_ports(role) + hdfs.metrics_service_ports(role) .into_iter() .map(|(name, value)| ServicePort { name: Some(name), @@ -145,79 +145,6 @@ pub(crate) fn rolegroup_metrics_service( Label::try_from(("prometheus.io/scrape", "true")) .context(BuildPrometheusLabelSnafu)?, ) - .with_annotations( - Annotations::try_from([ - ("prometheus.io/path".to_owned(), "/metrics".to_owned()), - ( - "prometheus.io/port".to_owned(), - hdfs.metrics_port(role).to_string(), - ), - ("prometheus.io/scheme".to_owned(), "http".to_owned()), - ("prometheus.io/scrape".to_owned(), "true".to_owned()), - ]) - .expect("should be valid annotations"), - ) - .build(), - spec: Some(service_spec), - status: None, - }) -} - -pub(crate) fn rolegroup_native_metrics_service( - hdfs: &v1alpha1::HdfsCluster, - role: &HdfsNodeRole, - rolegroup_ref: &RoleGroupRef, - resolved_product_image: &ResolvedProductImage, -) -> Result { - tracing::info!("Setting up native metrics Service for {:?}", rolegroup_ref); - - let service_spec = ServiceSpec { - // Internal communication does not need to be exposed - type_: Some("ClusterIP".to_string()), - cluster_ip: Some("None".to_string()), - ports: Some( - hdfs.native_metrics_ports(role) - .into_iter() - .map(|(name, value)| ServicePort { - name: Some(name), - port: i32::from(value), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }) - .collect(), - ), - selector: Some( - hdfs.rolegroup_selector_labels(rolegroup_ref) - .context(RoleGroupSelectorLabelsSnafu)? - .into(), - ), - publish_not_ready_addresses: Some(true), - ..ServiceSpec::default() - }; - - Ok(Service { - metadata: ObjectMetaBuilder::new() - .name_and_namespace(hdfs) - .name(format!( - "{name}-native-metrics", - name = rolegroup_ref.object_name() - )) - .ownerreference_from_resource(hdfs, None, Some(true)) - .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { - obj_ref: ObjectRef::from_obj(hdfs), - })? - .with_recommended_labels(build_recommended_labels( - hdfs, - RESOURCE_MANAGER_HDFS_CONTROLLER, - &resolved_product_image.app_version_label_value, - &rolegroup_ref.role, - &rolegroup_ref.role_group, - )) - .context(ObjectMetaSnafu)? 
-        .with_label(
-            Label::try_from(("prometheus.io/scrape", "true"))
-                .context(BuildPrometheusLabelSnafu)?,
-        )
         .with_annotations(
             Annotations::try_from([
                 ("prometheus.io/path".to_owned(), "/prom".to_owned()),
                 (
                     "prometheus.io/port".to_owned(),
                     hdfs.native_metrics_port(role).to_string(),
                 ),
                 (
                     "prometheus.io/scheme".to_owned(),
                     if hdfs.has_https_enabled() {
                         "https".to_owned()
                     } else {
                         "http".to_owned()
                     },
                 ),
                 ("prometheus.io/scrape".to_owned(), "true".to_owned()),
             ])
             .expect("should be valid annotations"),
         )
         .build(),
         spec: Some(service_spec),
         status: None,
     })
 }

From 3753ede7f37c309be2bc4a7eeefc9eee87d68189 Mon Sep 17 00:00:00 2001
From: Malte Sander
Date: Wed, 22 Oct 2025 16:21:48 +0200
Subject: [PATCH 04/18] start fixing smoke tests

---
 tests/templates/kuttl/smoke/30-assert.yaml.j2 | 94 +++++++++++++++
 tests/templates/kuttl/smoke/51-assert.yaml.j2 |  7 +-
 .../smoke/51-copy-metrics-test-script.yaml    |  4 +-
 .../{test_metrics.py => test_jmx_metrics.py}  |  6 +-
 .../kuttl/smoke/test_native_metrics.py        | 77 +++++++++++++
 .../kuttl/smoke/test_prometheus_metrics.py    | 109 ------------------
 tests/templates/kuttl/smoke/webhdfs.py        |  4 +-
 7 files changed, 181 insertions(+), 120 deletions(-)
 rename tests/templates/kuttl/smoke/{test_metrics.py => test_jmx_metrics.py} (92%)
 create mode 100755 tests/templates/kuttl/smoke/test_native_metrics.py
 delete mode 100644 tests/templates/kuttl/smoke/test_prometheus_metrics.py

diff --git a/tests/templates/kuttl/smoke/30-assert.yaml.j2 b/tests/templates/kuttl/smoke/30-assert.yaml.j2
index 0b8dffa2..7e8f72c6 100644
--- a/tests/templates/kuttl/smoke/30-assert.yaml.j2
+++ b/tests/templates/kuttl/smoke/30-assert.yaml.j2
@@ -78,6 +78,100 @@ status:
 {% if test_scenario['values']['datanode-pvcs'] == '2hdd-1ssd' %}
 ---
 apiVersion: v1
+kind: Service
+metadata:
+  name: hdfs-namenode-default-headless
+spec:
+  ports:
+    - name: rpc
+      port: 8020
+      protocol: TCP
+      targetPort: 8020
+    - name: http
+      port: 9870
+      protocol: TCP
+      targetPort: 9870
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: hdfs-namenode-default-metrics
+spec:
+  ports:
+    - name: metrics
+      port: 9870
+      protocol: TCP
+      targetPort: 9870
+    - name: jmx-metrics
+      port: 8183
+      protocol: TCP
+      targetPort: 8183
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: hdfs-datanode-default-headless
+spec:
+  ports:
+    - name: data
+      port: 9866
+      protocol: TCP
+      targetPort: 9866
+    - name: ipc
+      port: 9867
+      protocol: TCP
+      targetPort: 9867
+    - name: http
+      port: 9864
+      protocol: TCP
+      targetPort: 9864
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: hdfs-datanode-default-metrics
+spec:
+  ports:
+    - name: metrics
+      port: 9864
+      protocol: TCP
+      targetPort: 9864
+    - name: jmx-metrics
+      port: 8082
+      protocol: TCP
+      targetPort: 8082
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: hdfs-journalnode-default-headless
+spec:
+  ports:
+    - name: rpc
+      port: 8485
+      protocol: TCP
+      targetPort: 8485
+    - name: http
+      port: 8480
+      protocol: TCP
+      targetPort: 8480
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: hdfs-journalnode-default-metrics
+spec:
+  ports:
+    - name: metrics
+      port: 8480
+      protocol: TCP
+      targetPort: 8480
+    - name: jmx-metrics
+      port: 8081
+      protocol: TCP
+      targetPort: 8081
+---
+apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
   name: hdd-hdfs-datanode-default-0
diff --git a/tests/templates/kuttl/smoke/51-assert.yaml.j2 b/tests/templates/kuttl/smoke/51-assert.yaml.j2
index 6f57dda2..c403bd94 100644
--- a/tests/templates/kuttl/smoke/51-assert.yaml.j2
+++ b/tests/templates/kuttl/smoke/51-assert.yaml.j2
@@ -8,9 +8,8 @@ commands:
 {% else %}
     PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }}
 {% endif %}
-      # Test JMX exported metrics
+      # Test exported metrics
       kubectl exec --namespace=$NAMESPACE test-runner-0 -- \
-        python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION
-      # Test Prometheus metrics
+        python /tmp/test_jmx_metrics.py $NAMESPACE $PRODUCT_VERSION
kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ - python /tmp/test_prometheus_metrics.py $NAMESPACE $PRODUCT_VERSION + python /tmp/test_native_metrics.py $NAMESPACE diff --git a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml index bb617f97..cc94b9ff 100644 --- a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml +++ b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml @@ -2,5 +2,5 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - - script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp - - script: kubectl cp -n $NAMESPACE ./test_prometheus_metrics.py test-runner-0:/tmp + - script: kubectl cp -n $NAMESPACE ./test_jmx_metrics.py test-runner-0:/tmp + - script: kubectl cp -n $NAMESPACE ./test_native_metrics.py test-runner-0:/tmp diff --git a/tests/templates/kuttl/smoke/test_metrics.py b/tests/templates/kuttl/smoke/test_jmx_metrics.py similarity index 92% rename from tests/templates/kuttl/smoke/test_metrics.py rename to tests/templates/kuttl/smoke/test_jmx_metrics.py index 066c3e11..41aeef00 100755 --- a/tests/templates/kuttl/smoke/test_metrics.py +++ b/tests/templates/kuttl/smoke/test_jmx_metrics.py @@ -11,7 +11,7 @@ def check_metrics( namespace: str, role: str, port: int, expected_metrics: list[str] ) -> None: response: requests.Response = requests.get( - f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/metrics", + f"http://hdfs-{role}-default-metrics.{namespace}.svc.cluster.local:{port}/metrics", timeout=10, ) assert response.ok, "Requesting metrics failed" @@ -65,9 +65,9 @@ def check_datanode_metrics( # Kind "FSDatasetState" 'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}', # Kind "DataNodeActivity" suffixed with "_info" - 'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', + 'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default-headless\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', # Kind "DataNodeActivity" - 'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', + 'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default-headless\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', # Counter suffixed with "_total" 'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}', # Boolean metric diff --git a/tests/templates/kuttl/smoke/test_native_metrics.py b/tests/templates/kuttl/smoke/test_native_metrics.py new file mode 100755 index 00000000..3dc5034e --- /dev/null +++ b/tests/templates/kuttl/smoke/test_native_metrics.py @@ -0,0 +1,77 @@ +# Every rule in the JMX configuration is covered by one expected metric. 
+ +import re +import sys +import logging + +import requests + + +def check_metrics( + namespace: str, role: str, port: int, expected_metrics: list[str] +) -> None: + response: requests.Response = requests.get( + f"http://hdfs-{role}-default-metrics.{namespace}.svc.cluster.local:{port}/prom", + timeout=10, + ) + assert response.ok, "Requesting metrics failed" + + for metric in expected_metrics: + assert re.search(f"^{metric}", response.text, re.MULTILINE) is not None, ( + f"Metric '{metric}' not found for {role}" + ) + + +def check_namenode_metrics( + namespace: str, +) -> None: + expected_metrics: list[str] = [ + 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-', + 'namenode_total_file_ops{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default', + 'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-', + 'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-', + ] + + check_metrics(namespace, "namenode", 9870, expected_metrics) + + +def check_datanode_metrics( + namespace: str, +) -> None: + expected_metrics: list[str] = [ + 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}', + 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/data/datanode]\'}",hostname="hdfs-datanode-default-0"}', + 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/data/datanode]\'}",hostname="hdfs-datanode-default-0"}', + 'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', + 'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', + 'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}', + ] + + check_metrics(namespace, "datanode", 9864, expected_metrics) + + +def check_journalnode_metrics( + namespace: str, +) -> None: + expected_metrics: list[str] = [ + 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}', + 'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}', + ] + + check_metrics(namespace, "journalnode", 8480, expected_metrics) + + +if __name__ == "__main__": + namespace_arg: str = sys.argv[1] + + logging.basicConfig( + level="DEBUG", + format="%(asctime)s %(levelname)s: %(message)s", + stream=sys.stdout, + ) + + check_namenode_metrics(namespace_arg) + check_datanode_metrics(namespace_arg) + check_journalnode_metrics(namespace_arg) + + print("All expected metrics found") diff --git a/tests/templates/kuttl/smoke/test_prometheus_metrics.py b/tests/templates/kuttl/smoke/test_prometheus_metrics.py deleted file mode 100644 index fb19d908..00000000 --- a/tests/templates/kuttl/smoke/test_prometheus_metrics.py +++ /dev/null @@ -1,109 +0,0 @@ -# Fetch metrics from the built-in Prometheus endpoint of HDFS components. 
- -import logging -import sys - -import requests - - -def check_metrics( - namespace: str, role: str, port: int, expected_metrics: list[str] -) -> None: - response: requests.Response = requests.get( - f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/prom", - timeout=10, - ) - assert response.ok, "Requesting metrics failed" - - # Split the response into lines to check for metric names at the beginning of each line. - # This is a bit slower than using a regex but it allows to use special characters like "{}" in metric names - # without needing to escape them. - response_lines = response.text.splitlines() - for metric in expected_metrics: - # Use any() with a generator to stop early if the metric is found. - assert any((line.startswith(metric) for line in response_lines)) is True, ( - f"Metric '{metric}' not found for {role}" - ) - - -def check_namenode_metrics( - namespace: str, - product_version: str, -) -> None: - expected_metrics: list[str] = [ - # Kind "MetricsSystem" - 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}', - # Counter suffixed with "_total" - # The metric attributes can change so we remove them from the expected metric. - # The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}', - "fs_namesystem_files_total", - # Metric suffixed with "_created" - 'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', - # Boolean metric - # 'hadoop_namenode_security_enabled{kind="NameNodeStatus",role="NameNode",service="HDFS"}', - # Non-special metric - 'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', - ] - - check_metrics(namespace, "namenode", 9870, expected_metrics) - - -def check_datanode_metrics( - namespace: str, - product_version: str, -) -> None: - expected_metrics: list[str] = [ - # Kind "MetricsSystem" - 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}', - # Kind "FSDatasetState" suffixed with "_total" - # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total", - # Kind "FSDatasetState" - # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity", - # Kind "DataNodeActivity" suffixed with "_info" - 'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', - # Kind "DataNodeActivity" - 'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', - # Counter suffixed with "_total" - # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, 
/stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total", - # Boolean metric - #'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}', - # Non-special metric - 'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}', - ] - - check_metrics(namespace, "datanode", 9864, expected_metrics) - - -def check_journalnode_metrics( - namespace: str, - product_version: str, -) -> None: - expected_metrics: list[str] = [ - # Kind "MetricsSystem" - 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}', - # Non-special metric - 'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}', - # There is no boolean metric in JournalNode. - ] - - check_metrics(namespace, "journalnode", 8480, expected_metrics) - - -if __name__ == "__main__": - namespace_arg: str = sys.argv[1] - product_version_arg: str = sys.argv[2] - - logging.basicConfig( - level="DEBUG", - format="%(asctime)s %(levelname)s: %(message)s", - stream=sys.stdout, - ) - - check_namenode_metrics(namespace_arg, product_version_arg) - check_datanode_metrics(namespace_arg, product_version_arg) - check_journalnode_metrics(namespace_arg, product_version_arg) - - print("All expected metrics found") diff --git a/tests/templates/kuttl/smoke/webhdfs.py b/tests/templates/kuttl/smoke/webhdfs.py index d7bb4c3f..b0ccb40c 100755 --- a/tests/templates/kuttl/smoke/webhdfs.py +++ b/tests/templates/kuttl/smoke/webhdfs.py @@ -17,7 +17,7 @@ def main() -> int: if command == "ls": http_code = requests.get( - f"http://hdfs-namenode-default-0.hdfs-namenode-default.{namespace}.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=LISTSTATUS" + f"http://hdfs-namenode-default-0.hdfs-namenode-default-headless.{namespace}.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=LISTSTATUS" ).status_code if http_code != 200: result = 1 @@ -31,7 +31,7 @@ def main() -> int: ) } http_code = requests.put( - f"http://hdfs-namenode-default-0.hdfs-namenode-default.{namespace}.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=CREATE", + f"http://hdfs-namenode-default-0.hdfs-namenode-default-headless.{namespace}.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=CREATE", files=files, allow_redirects=True, ).status_code From 8b1d4f23527f9fe50d663c1c6904522a6e4fb1f7 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Thu, 23 Oct 2025 12:59:10 +0200 Subject: [PATCH 05/18] fix smoke tests --- .../templates/kuttl/smoke/test_jmx_metrics.py | 4 +- .../kuttl/smoke/test_native_metrics.py | 48 ++++++++++--------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/tests/templates/kuttl/smoke/test_jmx_metrics.py b/tests/templates/kuttl/smoke/test_jmx_metrics.py index 41aeef00..5e5a5753 100755 --- a/tests/templates/kuttl/smoke/test_jmx_metrics.py +++ b/tests/templates/kuttl/smoke/test_jmx_metrics.py @@ -65,9 +65,9 @@ def check_datanode_metrics( # Kind "FSDatasetState" 'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}', # Kind "DataNodeActivity" suffixed with "_info" - 'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default-headless\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', + 
'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-\\d+\\.hdfs-datanode-default-headless\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
         # Kind "DataNodeActivity"
-        'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default-headless\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
+        'hadoop_datanode_blocks_read{host="hdfs-datanode-default-\\d+\\.hdfs-datanode-default-headless\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
         # Counter suffixed with "_total"
         'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}',
         # Boolean metric
diff --git a/tests/templates/kuttl/smoke/test_native_metrics.py b/tests/templates/kuttl/smoke/test_native_metrics.py
index 3dc5034e..cdc63f85 100755
--- a/tests/templates/kuttl/smoke/test_native_metrics.py
+++ b/tests/templates/kuttl/smoke/test_native_metrics.py
@@ -1,4 +1,6 @@
-# Every rule in the JMX configuration is covered by one expected metric.
+# Native Prometheus metrics test
+# We use raw strings for "expected_metrics" but still have to escape the special regex characters "[", "]", "{" and "}"
+# that we expect to be in the metrics string.
 import re
 import sys
 
@@ -8,28 +10,28 @@
 
 
 def check_metrics(
-    namespace: str, role: str, port: int, expected_metrics: list[str]
+    namespace: str,
+    role: str,
+    port: int,
+    expected_metrics: list[str]
 ) -> None:
-    response: requests.Response = requests.get(
+    response = requests.get(
         f"http://hdfs-{role}-default-metrics.{namespace}.svc.cluster.local:{port}/prom",
         timeout=10,
     )
-    assert response.ok, "Requesting metrics failed"
+    assert response.ok, f"Requesting metrics failed for {role}."
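For illustration, a minimal standalone sketch of the matching approach used by `check_metrics`; the `SAMPLE` payload below is invented for this sketch and not taken from a real scrape:

[source,python]
----
import re

# Invented two-line excerpt of what a /prom scrape might return.
SAMPLE = """\
namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"} 42
fs_namesystem_files_total 17
"""

# Raw string pattern: "{" and "}" are escaped for the regex engine, and "\d+"
# matches any pod ordinal, so a single pattern covers all replicas.
pattern = r'namenode_files_created\{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-\d+"\}'

# re.MULTILINE mirrors the tests; it only matters once "^"/"$" anchors are used.
assert re.search(pattern, SAMPLE, re.MULTILINE) is not None
print("pattern matched")
----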
for metric in expected_metrics: - assert re.search(f"^{metric}", response.text, re.MULTILINE) is not None, ( - f"Metric '{metric}' not found for {role}" - ) + regex = re.compile(metric, re.MULTILINE) + assert regex.search(response.text) is not None, f"Metric '{metric}' not found for {role}" -def check_namenode_metrics( - namespace: str, -) -> None: - expected_metrics: list[str] = [ - 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-', - 'namenode_total_file_ops{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default', - 'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-', - 'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-', +def check_namenode_metrics(namespace: str) -> None: + expected_metrics = [ + r'metrics_system_num_active_sources\{context="metricssystem",hostname="hdfs-namenode-default-\d+"\}', + r'namenode_total_file_ops\{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-\d+"\}', + r'namenode_files_created\{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-\d+"\}', + r'namenode_files_deleted\{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-\d+"\}', ] check_metrics(namespace, "namenode", 9870, expected_metrics) @@ -39,12 +41,12 @@ def check_datanode_metrics( namespace: str, ) -> None: expected_metrics: list[str] = [ - 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}', - 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/data/datanode]\'}",hostname="hdfs-datanode-default-0"}', - 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/data/datanode]\'}",hostname="hdfs-datanode-default-0"}', - 'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', - 'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', - 'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}', + r'metrics_system_num_active_sources\{context="metricssystem",hostname="hdfs-datanode-default-\d+', + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + r'datanode_blocks_get_local_path_info\{sessionid="null",context="dfs",hostname="hdfs-datanode-default-\d+"\}', + r'datanode_blocks_read\{sessionid="null",context="dfs",hostname="hdfs-datanode-default-\d+"\}', + r'jvm_metrics_gc_count\{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-\d+"\}', ] check_metrics(namespace, "datanode", 9864, expected_metrics) @@ -54,8 +56,8 @@ def check_journalnode_metrics( namespace: str, ) -> None: expected_metrics: list[str] = [ - 
'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}', - 'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}', + r'metrics_system_num_active_sources\{context="metricssystem",hostname="hdfs-journalnode-default-\d+"\}', + r'journal_node_bytes_written\{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-\d+"\}', ] check_metrics(namespace, "journalnode", 8480, expected_metrics) From f51394f768c519458f90ad363a4318139b00ef83 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Thu, 23 Oct 2025 16:37:12 +0200 Subject: [PATCH 06/18] fix smoke tests scripts with regex --- tests/templates/kuttl/smoke/test_jmx_metrics.py | 2 +- tests/templates/kuttl/smoke/test_native_metrics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/templates/kuttl/smoke/test_jmx_metrics.py b/tests/templates/kuttl/smoke/test_jmx_metrics.py index 5e5a5753..f88437c4 100755 --- a/tests/templates/kuttl/smoke/test_jmx_metrics.py +++ b/tests/templates/kuttl/smoke/test_jmx_metrics.py @@ -126,4 +126,4 @@ def check_journalnode_metrics( check_datanode_metrics(namespace_arg, product_version_arg) check_journalnode_metrics(namespace_arg, product_version_arg) - print("All expected metrics found") + print("All expected JMX metrics found") diff --git a/tests/templates/kuttl/smoke/test_native_metrics.py b/tests/templates/kuttl/smoke/test_native_metrics.py index cdc63f85..e634fb7f 100755 --- a/tests/templates/kuttl/smoke/test_native_metrics.py +++ b/tests/templates/kuttl/smoke/test_native_metrics.py @@ -76,4 +76,4 @@ def check_journalnode_metrics( check_datanode_metrics(namespace_arg) check_journalnode_metrics(namespace_arg) - print("All expected metrics found") + print("All expected native metrics found") From f5689632f98620c800225def939e8d097affe9b1 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Thu, 23 Oct 2025 18:38:40 +0200 Subject: [PATCH 07/18] fix missing tests --- tests/templates/kuttl/profiling/run-profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/templates/kuttl/profiling/run-profiler.py b/tests/templates/kuttl/profiling/run-profiler.py index 56727c63..633fb664 100644 --- a/tests/templates/kuttl/profiling/run-profiler.py +++ b/tests/templates/kuttl/profiling/run-profiler.py @@ -56,7 +56,7 @@ def fetch_flamegraph(service_url, refresh_path): def test_profiling(role, port): service_url = ( - f"http://test-hdfs-{role}-default-0.test-hdfs-{role}-default" f":{port}" + f"http://test-hdfs-{role}-default-0.test-hdfs-{role}-default-headless" f":{port}" ) print(f"Test profiling on {service_url}") From 7d74a8348e2b7ec6a62700a97b96857382b725c9 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Fri, 24 Oct 2025 08:44:24 +0200 Subject: [PATCH 08/18] fix scopes and visibility --- rust/operator-binary/src/container.rs | 2 +- rust/operator-binary/src/crd/mod.rs | 147 ++++++++++++-------------- 2 files changed, 71 insertions(+), 78 deletions(-) diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index f13ad8a1..58c03319 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -68,7 +68,7 @@ use crate::{ NAMENODE_ROOT_DATA_DIR, READINESS_PROBE_FAILURE_THRESHOLD, READINESS_PROBE_INITIAL_DELAY_SECONDS, READINESS_PROBE_PERIOD_SECONDS, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, - SERVICE_PORT_NAME_METRICS, SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR, + SERVICE_PORT_NAME_RPC, 
STACKABLE_ROOT_DATA_DIR, }, storage::DataNodeStorageConfig, v1alpha1, diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index bd3674bf..b7d7a397 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -671,8 +671,75 @@ impl v1alpha1::HdfsCluster { .sum() } + pub fn native_metrics_port(&self, role: &HdfsNodeRole) -> u16 { + match role { + HdfsNodeRole::Name => { + if self.has_https_enabled() { + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT + } + } + HdfsNodeRole::Data => { + if self.has_https_enabled() { + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT + } + } + HdfsNodeRole::Journal => { + if self.has_https_enabled() { + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT + } + } + } + } + + /// Deprecated required JMX metrics port name and metrics port number tuples depending on the role. + pub fn jmx_metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + match role { + HdfsNodeRole::Name => vec![( + String::from(SERVICE_PORT_NAME_JMX_METRICS), + DEFAULT_NAME_NODE_METRICS_PORT, + )], + HdfsNodeRole::Data => vec![( + String::from(SERVICE_PORT_NAME_JMX_METRICS), + DEFAULT_DATA_NODE_METRICS_PORT, + )], + HdfsNodeRole::Journal => vec![( + String::from(SERVICE_PORT_NAME_JMX_METRICS), + DEFAULT_JOURNAL_NODE_METRICS_PORT, + )], + } + } + + pub fn metrics_service_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut metrics_service_ports = vec![]; + // "native" ports + metrics_service_ports.extend(self.native_metrics_ports(role)); + // deprecated jmx ports + metrics_service_ports.extend(self.jmx_metrics_ports(role)); + metrics_service_ports + } + + pub fn headless_service_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut headless_service_ports = vec![]; + headless_service_ports.extend(self.data_ports(role)); + headless_service_ports + } + + pub fn hdfs_main_container_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut main_container_ports = vec![]; + main_container_ports.extend(self.data_ports(role)); + // TODO: This will be exposed in the listener if added to container ports? + // main_container_ports.extend(self.jmx_metrics_ports(role)); + main_container_ports + } + /// Returns required port name and port number tuples depending on the role. - pub fn data_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + fn data_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { match role { HdfsNodeRole::Name => vec![ ( @@ -732,26 +799,8 @@ impl v1alpha1::HdfsCluster { } } - /// Deprecated required JMX metrics port name and metrics port number tuples depending on the role. - pub fn jmx_metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { - match role { - HdfsNodeRole::Name => vec![( - String::from(SERVICE_PORT_NAME_JMX_METRICS), - DEFAULT_NAME_NODE_METRICS_PORT, - )], - HdfsNodeRole::Data => vec![( - String::from(SERVICE_PORT_NAME_JMX_METRICS), - DEFAULT_DATA_NODE_METRICS_PORT, - )], - HdfsNodeRole::Journal => vec![( - String::from(SERVICE_PORT_NAME_JMX_METRICS), - DEFAULT_JOURNAL_NODE_METRICS_PORT, - )], - } - } - - /// Returns required metrics port name and metrics port number tuples depending on the role and security settings. 
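The same role-and-TLS port selection as `native_metrics_port` above, sketched in Python for clarity; the HTTPS values are assumed here to be the stock Hadoop web UI ports, while the HTTP values match the ports the smoke tests scrape:

[source,python]
----
# Assumed default /prom ports per role; the HTTPS values are the standard
# Hadoop web UI ports and are an assumption in this sketch.
NATIVE_METRICS_PORTS = {
    "namenode": {"http": 9870, "https": 9871},
    "datanode": {"http": 9864, "https": 9865},
    "journalnode": {"http": 8480, "https": 8481},
}


def native_metrics_port(role: str, https_enabled: bool) -> int:
    """Pick the native metrics port for a role, switching on the TLS setting."""
    return NATIVE_METRICS_PORTS[role]["https" if https_enabled else "http"]


assert native_metrics_port("namenode", https_enabled=False) == 9870
assert native_metrics_port("journalnode", https_enabled=True) == 8481
----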
- pub fn metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + /// Returns required native metrics port name and metrics port number tuples depending on the role and security settings. + fn native_metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { match role { HdfsNodeRole::Name => vec![if self.has_https_enabled() { ( @@ -788,62 +837,6 @@ impl v1alpha1::HdfsCluster { }], } } - - pub fn metrics_port(&self, role: &HdfsNodeRole) -> u16 { - match role { - HdfsNodeRole::Name => DEFAULT_NAME_NODE_METRICS_PORT, - HdfsNodeRole::Data => DEFAULT_DATA_NODE_METRICS_PORT, - HdfsNodeRole::Journal => DEFAULT_JOURNAL_NODE_METRICS_PORT, - } - } - - pub fn native_metrics_port(&self, role: &HdfsNodeRole) -> u16 { - match role { - HdfsNodeRole::Name => { - if self.has_https_enabled() { - DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT - } else { - DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT - } - } - HdfsNodeRole::Data => { - if self.has_https_enabled() { - DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT - } else { - DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT - } - } - HdfsNodeRole::Journal => { - if self.has_https_enabled() { - DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT - } else { - DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT - } - } - } - } - - pub fn metrics_service_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { - let mut metrics_service_ports = vec![]; - // "native" ports - metrics_service_ports.extend(self.metrics_ports(role)); - metrics_service_ports.extend(self.jmx_metrics_ports(role)); - metrics_service_ports - } - - pub fn headless_service_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { - let mut headless_service_ports = vec![]; - headless_service_ports.extend(self.data_ports(role)); - headless_service_ports - } - - pub fn hdfs_main_container_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { - let mut main_container_ports = vec![]; - main_container_ports.extend(self.data_ports(role)); - // TODO: This will be exposed in the listener if added to container ports? - // main_container_ports.extend(self.jmx_metrics_ports(role)); - main_container_ports - } } #[derive(Clone, Debug, Deserialize, Eq, Hash, JsonSchema, PartialEq, Serialize)] From 3d9ffe09ea310507ae9dde392744ee1fb5fb9710 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 27 Oct 2025 07:36:07 +0100 Subject: [PATCH 09/18] adapted docs --- .../hdfs/pages/usage-guide/monitoring.adoc | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc index 53c52956..7f92ea29 100644 --- a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc +++ b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc @@ -1,17 +1,34 @@ = Monitoring -:description: The HDFS cluster can be monitored with Prometheus from inside or outside the K8S cluster. +:description: The HDFS cluster is automatically configured to export Prometheus metrics. The cluster can be monitored with Prometheus from inside or outside the K8S cluster. -All services (with the exception of the Zookeeper daemon on the node names) run with the JMX exporter agent enabled and expose metrics on the `metrics` port. -This port is available from the container level up to the NodePort services. +The managed HDFS stacklets are automatically configured to export Prometheus metrics. +See xref:operators:monitoring.adoc[] for more details. 
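For orientation, a hedged sketch (as plain Python dicts) of the scrape hints the metrics Service is expected to carry; the keys correspond to the `prometheus.io/scrape` label and the `prometheus.io/path|port|scheme` annotations referenced in the changelog below, and the values are illustrative for a non-TLS namenode:

[source,python]
----
# Illustrative only: labels and annotations a namenode metrics Service might carry.
metrics_service_labels = {
    "prometheus.io/scrape": "true",
}
metrics_service_annotations = {
    "prometheus.io/path": "/prom",
    "prometheus.io/port": "9870",
    "prometheus.io/scheme": "http",
}
----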
[IMPORTANT]
====
-Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are also available at the `/prom` endpoint of all the UI services.
+Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are available at the `/prom` endpoint of all the UI services.
 The JMX exporter metrics are now deprecated and will be removed in a future release.
 ====
 
-The metrics endpoints are also used as liveliness probes by Kubernetes.
+This endpoint, in the case of the Namenode service, is reachable via the `metrics` service: 
+[source,shell]
+----
+http://<name>-namenode-<rolegroup>-metrics:9870/prom
+----
 
-See xref:operators:monitoring.adoc[] for more details.
+== Authentication when using TLS
+
+HDFS exposes metrics through the same port as its web UI. Hence, when configuring HDFS with TLS the metrics are also secured by TLS,
+and the clients scraping the metrics endpoint need to authenticate against it. This could for example be accomplished by utilizing mTLS
+between Kubernetes Pods with the xref:home:secret-operator:index.adoc[Secret Operator].
+
+When using the Prometheus `ServiceMonitor` for scraping, the `address` label needs relabeling to use the `headless` Service instead of the
+`metrics` Service. This is because by default Prometheus targets the Pod IPs as endpoints, but since the Pod IPs are not
+part of the certificate, the authentication will fail. Instead, the FQDN of the Pods, which can be added to the certificate, is used, but
+this FQDN is only available through the `headless` Service.
+
+A more detailed explanation can be found in the xref:home:nifi:usage_guide/monitoring.adoc[NiFi Operator Monitoring Docs] with a similar situation
+and an example of a Prometheus `ServiceMonitor` configured for TLS in the
+https://github.com/stackabletech/demos/blob/main/stacks/monitoring/prometheus-service-monitors.yaml[Monitoring Stack{external-link-icon}^].

From be50ba3332e8435dd3a8e2adb6dcc00c598672a6 Mon Sep 17 00:00:00 2001
From: Malte Sander 
Date: Mon, 27 Oct 2025 07:49:30 +0100
Subject: [PATCH 10/18] run precommit

---
 .../hdfs/pages/usage-guide/monitoring.adoc     |  2 +-
 .../kuttl/logging/test_log_aggregation.py      | 18 +++++++++---------
 .../templates/kuttl/profiling/run-profiler.py  |  5 ++---
 .../kuttl/smoke/test_native_metrics.py         |  9 ++++-----
 4 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
index 7f92ea29..58230610 100644
--- a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
+++ b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
@@ -12,7 +12,7 @@ Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are
 The JMX exporter metrics are now deprecated and will be removed in a future release.
 ====
 
-This endpoint, in the case of the Namenode service, is reachable via the `metrics` service: 
+This endpoint, in the case of the Namenode service, is reachable via the `metrics` service:
 [source,shell]
 ----
 http://<name>-namenode-<rolegroup>-metrics:9870/prom
diff --git a/tests/templates/kuttl/logging/test_log_aggregation.py b/tests/templates/kuttl/logging/test_log_aggregation.py
index 561eff2c..845b3814 100755
--- a/tests/templates/kuttl/logging/test_log_aggregation.py
+++ b/tests/templates/kuttl/logging/test_log_aggregation.py
@@ -23,9 +23,9 @@ def check_sent_events():
         },
     )
 
-    assert (
-        response.status_code == 200
-    ), "Cannot access the API of the vector aggregator."
+    assert response.status_code == 200, (
+        "Cannot access the API of the vector aggregator."
+    )
 
     result = response.json()
 
@@ -35,13 +35,13 @@ def check_sent_events():
         componentId = transform["componentId"]
 
         if componentId == "filteredInvalidEvents":
-            assert (
-                sentEvents is None or sentEvents["sentEventsTotal"] == 0
-            ), "Invalid log events were sent."
+            assert sentEvents is None or sentEvents["sentEventsTotal"] == 0, (
+                "Invalid log events were sent."
+            )
         else:
-            assert (
-                sentEvents is not None and sentEvents["sentEventsTotal"] > 0
-            ), f'No events were sent in "{componentId}".'
+            assert sentEvents is not None and sentEvents["sentEventsTotal"] > 0, (
+                f'No events were sent in "{componentId}".'
+            )
 
 
 if __name__ == "__main__":
diff --git a/tests/templates/kuttl/profiling/run-profiler.py b/tests/templates/kuttl/profiling/run-profiler.py
index 633fb664..ad46a3de 100644
--- a/tests/templates/kuttl/profiling/run-profiler.py
+++ b/tests/templates/kuttl/profiling/run-profiler.py
@@ -8,8 +8,7 @@
 
 def start_profiling_and_get_refresh_header(service_url):
     prof_page = requests.get(
-        f"{service_url}/prof"
-        f"?event={EVENT_TYPE}&duration={PROFILING_DURATION_IN_SEC}"
+        f"{service_url}/prof?event={EVENT_TYPE}&duration={PROFILING_DURATION_IN_SEC}"
     )
 
     assert prof_page.ok, f"""Profiling could not be started.
@@ -56,7 +55,7 @@ def fetch_flamegraph(service_url, refresh_path):
 
 def test_profiling(role, port):
     service_url = (
-        f"http://test-hdfs-{role}-default-0.test-hdfs-{role}-default-headless" f":{port}"
+        f"http://test-hdfs-{role}-default-0.test-hdfs-{role}-default-headless:{port}"
    )
 
     print(f"Test profiling on {service_url}")
diff --git a/tests/templates/kuttl/smoke/test_native_metrics.py b/tests/templates/kuttl/smoke/test_native_metrics.py
index e634fb7f..c7138ec1 100755
--- a/tests/templates/kuttl/smoke/test_native_metrics.py
+++ b/tests/templates/kuttl/smoke/test_native_metrics.py
@@ -10,10 +10,7 @@
 
 
 def check_metrics(
-    namespace: str,
-    role: str,
-    port: int,
-    expected_metrics: list[str]
+    namespace: str, role: str, port: int, expected_metrics: list[str]
 ) -> None:
     response = requests.get(
         f"http://hdfs-{role}-default-metrics.{namespace}.svc.cluster.local:{port}/prom",
@@ -23,7 +20,9 @@ def check_metrics(
 
     for metric in expected_metrics:
         regex = re.compile(metric, re.MULTILINE)
-        assert regex.search(response.text) is not None, f"Metric '{metric}' not found for {role}"
+        assert regex.search(response.text) is not None, (
+            f"Metric '{metric}' not found for {role}"
+        )
 

From 7a5fd2a70887f1283ae9dbc618c43db2a5134884 Mon Sep 17 00:00:00 2001
From: Malte Sander 
Date: Mon, 27 Oct 2025 09:23:22 +0100
Subject: [PATCH 11/18] add metrics service to tls cert

---
 .../hdfs/pages/usage-guide/monitoring.adoc  |  9 ------
 rust/operator-binary/src/container.rs       | 24 +++++++++++--------
 rust/operator-binary/src/hdfs_controller.rs |  7 +++---
 3 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
index 58230610..9c6d5eee 100644
--- a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
+++ b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
@@ -23,12 +23,3 @@ http://<name>-namenode-<rolegroup>-metrics:9870/prom
 HDFS exposes metrics through the same port as its web UI. Hence, when configuring HDFS with TLS the metrics are also secured by TLS,
 and the clients scraping the metrics endpoint need to authenticate against it. This could for example be accomplished by utilizing mTLS
 between Kubernetes Pods with the xref:home:secret-operator:index.adoc[Secret Operator].
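To make the certificate concern above concrete, here is a small sketch of the stable DNS names involved; `my-namespace` and the default `cluster.local` domain are placeholders:

[source,python]
----
def pod_fqdn(
    name: str,
    role: str,
    rolegroup: str,
    ordinal: int,
    namespace: str,
    cluster_domain: str = "cluster.local",
) -> str:
    """Stable pod DNS name served by the headless Service.

    Unlike a pod IP, this name can be added to the TLS certificate,
    which is why scrapers should target it.
    """
    headless = f"{name}-{role}-{rolegroup}-headless"
    return f"{name}-{role}-{rolegroup}-{ordinal}.{headless}.{namespace}.svc.{cluster_domain}"


print(pod_fqdn("hdfs", "datanode", "default", 0, "my-namespace"))
# hdfs-datanode-default-0.hdfs-datanode-default-headless.my-namespace.svc.cluster.local
----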
-
-When using the Prometheus `ServiceMonitor` for scraping, the `address` label needs relabeling to use the `headless` Service instead of the
-`metrics` Service. This is because by default Prometheus targets the Pod IPs as endpoints, but since the Pod IPs are not
-part of the certificate, the authentication will fail. Instead, the FQDN of the Pods, which can be added to the certificate, is used, but
-this FQDN is only available through the `headless` Service.
-
-A more detailed explanation can be found in the xref:home:nifi:usage_guide/monitoring.adoc[NiFi Operator Monitoring Docs] with a similar situation
-and an example of a Prometheus `ServiceMonitor` configured for TLS in the
-https://github.com/stackabletech/demos/blob/main/stacks/monitoring/prometheus-service-monitors.yaml[Monitoring Stack{external-link-icon}^].
diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs
index 58c03319..8f6181de 100644
--- a/rust/operator-binary/src/container.rs
+++ b/rust/operator-binary/src/container.rs
@@ -48,6 +48,7 @@ use stackable_operator::{
             CustomContainerLogConfig,
         },
     },
+    role_utils::RoleGroupRef,
     utils::{COMMON_BASH_TRAP_FUNCTIONS, cluster_info::KubernetesClusterInfo},
 };
 use strum::{Display, EnumDiscriminants, IntoStaticStr};
@@ -216,24 +217,25 @@ impl ContainerConfig {
         hdfs: &v1alpha1::HdfsCluster,
         cluster_info: &KubernetesClusterInfo,
         role: &HdfsNodeRole,
-        role_group: &str,
+        rolegroup_ref: &RoleGroupRef<v1alpha1::HdfsCluster>,
         resolved_product_image: &ResolvedProductImage,
         merged_config: &AnyNodeConfig,
         env_overrides: Option<&BTreeMap<String, String>>,
         zk_config_map_name: &str,
-        object_name: &str,
         namenode_podrefs: &[HdfsPodRef],
         labels: &Labels,
     ) -> Result<(), Error> {
         // HDFS main container
         let main_container_config = Self::from(*role);
-        pb.add_volumes(main_container_config.volumes(merged_config, object_name, labels)?)
+        let object_name = &rolegroup_ref.object_name();
+
+        pb.add_volumes(main_container_config.volumes(merged_config, &object_name, labels)?)
            .context(AddVolumeSnafu)?;
         pb.add_container(main_container_config.main_container(
             hdfs,
             cluster_info,
             role,
-            role_group,
+            rolegroup_ref,
             resolved_product_image,
             zk_config_map_name,
             env_overrides,
@@ -277,6 +279,8 @@ impl ContainerConfig {
                     )
                     .with_pod_scope()
                     .with_node_scope()
+                    // Add the metrics Service to the certificate scope so the TLS-secured metrics endpoint can be scraped via that Service (not only via pod FQDNs)
+                    .with_service_scope(rolegroup_ref.rolegroup_metrics_service_name())
                     .with_format(SecretFormat::TlsPkcs12)
                     .with_tls_pkcs12_password(TLS_STORE_PASSWORD)
                     .with_auto_tls_cert_lifetime(
@@ -327,7 +331,7 @@ impl ContainerConfig {
                 hdfs,
                 cluster_info,
                 role,
-                role_group,
+                &rolegroup_ref,
                 resolved_product_image,
                 zk_config_map_name,
                 env_overrides,
@@ -348,7 +352,7 @@ impl ContainerConfig {
                 hdfs,
                 cluster_info,
                 role,
-                role_group,
+                &rolegroup_ref.role_group,
                 resolved_product_image,
                 zk_config_map_name,
                 env_overrides,
@@ -370,7 +374,7 @@ impl ContainerConfig {
                 hdfs,
                 cluster_info,
                 role,
-                role_group,
+                &rolegroup_ref.role_group,
                 resolved_product_image,
                 zk_config_map_name,
                 env_overrides,
@@ -393,7 +397,7 @@ impl ContainerConfig {
                 hdfs,
                 cluster_info,
                 role,
-                role_group,
+                &rolegroup_ref.role_group,
                 resolved_product_image,
                 zk_config_map_name,
                 env_overrides,
@@ -462,7 +466,7 @@ impl ContainerConfig {
         hdfs: &v1alpha1::HdfsCluster,
         cluster_info: &KubernetesClusterInfo,
         role: &HdfsNodeRole,
-        role_group: &str,
+        rolegroup_ref: &RoleGroupRef<v1alpha1::HdfsCluster>,
         resolved_product_image: &ResolvedProductImage,
         zookeeper_config_map_name: &str,
         env_overrides: Option<&BTreeMap<String, String>>,
@@ -481,7 +485,7 @@ impl ContainerConfig {
             .args(self.args(hdfs, cluster_info, role, merged_config, &[])?)
             .add_env_vars(self.env(
                 hdfs,
-                role_group,
+                &rolegroup_ref.role_group,
                 zookeeper_config_map_name,
                 env_overrides,
                 resources.as_ref(),
diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs
index f1ab314a..4dc2109c 100644
--- a/rust/operator-binary/src/hdfs_controller.rs
+++ b/rust/operator-binary/src/hdfs_controller.rs
@@ -853,12 +853,11 @@ fn rolegroup_statefulset(
             hdfs,
             cluster_info,
             role,
-            &rolegroup_ref.role_group,
+            &rolegroup_ref,
             resolved_product_image,
             merged_config,
             env_overrides,
             &hdfs.spec.cluster_config.zookeeper_config_map_name,
-            &rolegroup_ref.object_name(),
             namenode_podrefs,
             &rolegroup_selector_labels,
         )
@@ -979,6 +978,7 @@ properties: []
             .unwrap()
             .get("default")
             .unwrap();
+        let rolegroup_ref = hdfs.rolegroup_ref(&role.to_string(), "default");
         let env_overrides = rolegroup_config.get(&PropertyNameKind::Env);
 
         let merged_config = role.merged_config(&hdfs, "default").unwrap();
@@ -997,12 +997,11 @@ properties: []
                 cluster_domain: DomainName::try_from("cluster.local").unwrap(),
             },
             &role,
-            "default",
+            &rolegroup_ref,
             &resolved_product_image,
             &merged_config,
             env_overrides,
             &hdfs.spec.cluster_config.zookeeper_config_map_name,
-            "todo",
             &[],
             &Labels::new(),
         )

From ea06b9060332e6ea0e82da3a0ec3d83d4d1ae1e3 Mon Sep 17 00:00:00 2001
From: Malte Sander 
Date: Mon, 27 Oct 2025 09:27:47 +0100
Subject: [PATCH 12/18] clippy

---
 rust/operator-binary/src/container.rs       | 12 ++++++------
 rust/operator-binary/src/hdfs_controller.rs |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs
index 8f6181de..b09be984 100644
--- a/rust/operator-binary/src/container.rs
+++ b/rust/operator-binary/src/container.rs
@@ -227,7 +227,7 @@ impl ContainerConfig {
     ) -> Result<(), Error> {
         // HDFS main container
         let main_container_config = Self::from(*role);
-        let object_name = 
&rolegroup_ref.object_name(); + let object_name = rolegroup_ref.object_name(); pb.add_volumes(main_container_config.volumes(merged_config, &object_name, labels)?) .context(AddVolumeSnafu)?; @@ -323,7 +323,7 @@ impl ContainerConfig { let zkfc_container_config = Self::try_from(NameNodeContainer::Zkfc.to_string())?; pb.add_volumes(zkfc_container_config.volumes( merged_config, - object_name, + &object_name, labels, )?) .context(AddVolumeSnafu)?; @@ -331,7 +331,7 @@ impl ContainerConfig { hdfs, cluster_info, role, - &rolegroup_ref, + rolegroup_ref, resolved_product_image, zk_config_map_name, env_overrides, @@ -344,7 +344,7 @@ impl ContainerConfig { Self::try_from(NameNodeContainer::FormatNameNodes.to_string())?; pb.add_volumes(format_namenodes_container_config.volumes( merged_config, - object_name, + &object_name, labels, )?) .context(AddVolumeSnafu)?; @@ -366,7 +366,7 @@ impl ContainerConfig { Self::try_from(NameNodeContainer::FormatZooKeeper.to_string())?; pb.add_volumes(format_zookeeper_container_config.volumes( merged_config, - object_name, + &object_name, labels, )?) .context(AddVolumeSnafu)?; @@ -389,7 +389,7 @@ impl ContainerConfig { Self::try_from(DataNodeContainer::WaitForNameNodes.to_string())?; pb.add_volumes(wait_for_namenodes_container_config.volumes( merged_config, - object_name, + &object_name, labels, )?) .context(AddVolumeSnafu)?; diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs index 4dc2109c..10990c54 100644 --- a/rust/operator-binary/src/hdfs_controller.rs +++ b/rust/operator-binary/src/hdfs_controller.rs @@ -853,7 +853,7 @@ fn rolegroup_statefulset( hdfs, cluster_info, role, - &rolegroup_ref, + rolegroup_ref, resolved_product_image, merged_config, env_overrides, From 00af004f47ef0a9d262da072813adb7bdc507769 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 27 Oct 2025 09:39:18 +0100 Subject: [PATCH 13/18] clippy 2 --- rust/operator-binary/src/hdfs_controller.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs index 10990c54..2b7b1e07 100644 --- a/rust/operator-binary/src/hdfs_controller.rs +++ b/rust/operator-binary/src/hdfs_controller.rs @@ -978,7 +978,7 @@ properties: [] .unwrap() .get("default") .unwrap(); - let rolegroup_ref = hdfs.rolegroup_ref(&role.to_string(), "default"); + let rolegroup_ref = hdfs.rolegroup_ref(role.to_string(), "default"); let env_overrides = rolegroup_config.get(&PropertyNameKind::Env); let merged_config = role.merged_config(&hdfs, "default").unwrap(); From 7b77186231e5b0748b9a2b60f72c9a6147f7e737 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 27 Oct 2025 11:39:26 +0100 Subject: [PATCH 14/18] remove listener class from zk in test --- tests/templates/kuttl/smoke/20-install-zk.yaml.j2 | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/templates/kuttl/smoke/20-install-zk.yaml.j2 b/tests/templates/kuttl/smoke/20-install-zk.yaml.j2 index 23fd5e7b..9785e363 100644 --- a/tests/templates/kuttl/smoke/20-install-zk.yaml.j2 +++ b/tests/templates/kuttl/smoke/20-install-zk.yaml.j2 @@ -7,9 +7,8 @@ spec: image: productVersion: "{{ test_scenario['values']['zookeeper'] }}" pullPolicy: IfNotPresent - clusterConfig: - listenerClass: {{ test_scenario['values']['listener-class'] }} {% if lookup('env', 'VECTOR_AGGREGATOR') %} + clusterConfig: vectorAggregatorConfigMapName: vector-aggregator-discovery {% endif %} servers: From dee9e6d075b45cb62b75fe49de0a55b30a50f810 Mon Sep 
17 00:00:00 2001 From: Malte Sander Date: Mon, 27 Oct 2025 11:55:41 +0100 Subject: [PATCH 15/18] Apply suggestions from code review Co-authored-by: Andrew Kenworthy <1712947+adwk67@users.noreply.github.com> --- tests/templates/kuttl/smoke/30-assert.yaml.j2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/templates/kuttl/smoke/30-assert.yaml.j2 b/tests/templates/kuttl/smoke/30-assert.yaml.j2 index 7e8f72c6..e685aa05 100644 --- a/tests/templates/kuttl/smoke/30-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/30-assert.yaml.j2 @@ -88,9 +88,9 @@ spec: protocol: TCP targetPort: 8020 - name: http - port: 9070 + port: 9870 protocol: TCP - targetPort: 9070 + targetPort: 9870 --- apiVersion: v1 kind: Service @@ -144,7 +144,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: hdfs-journal-default-headless + name: hdfs-journalnode-default-headless spec: ports: - name: rpc @@ -159,7 +159,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: hdfs-journal-default-metrics + name: hdfs-journalnode-default-metrics spec: ports: - name: metrics From 3c04ec1246b7c3cab1aa08e76f24cac444a3ac75 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 27 Oct 2025 13:02:17 +0100 Subject: [PATCH 16/18] rename test script, fix multi disk smoke test --- tests/templates/kuttl/smoke/51-assert.yaml.j2 | 2 +- .../smoke/51-copy-metrics-test-script.yaml | 2 +- ...e_metrics.py => test_prometheus_metrics.py} | 18 +++++++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) rename tests/templates/kuttl/smoke/{test_native_metrics.py => test_prometheus_metrics.py} (66%) diff --git a/tests/templates/kuttl/smoke/51-assert.yaml.j2 b/tests/templates/kuttl/smoke/51-assert.yaml.j2 index c403bd94..dcdeb801 100644 --- a/tests/templates/kuttl/smoke/51-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/51-assert.yaml.j2 @@ -12,4 +12,4 @@ commands: kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ python /tmp/test_jmx_metrics.py $NAMESPACE $PRODUCT_VERSION kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ - python /tmp/test_native_metrics.py $NAMESPACE + python /tmp/test_prometheus_metrics.py $NAMESPACE {{ test_scenario['values']['datanode-pvcs'] }} diff --git a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml index cc94b9ff..caefdc8d 100644 --- a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml +++ b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml @@ -3,4 +3,4 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - script: kubectl cp -n $NAMESPACE ./test_jmx_metrics.py test-runner-0:/tmp - - script: kubectl cp -n $NAMESPACE ./test_native_metrics.py test-runner-0:/tmp + - script: kubectl cp -n $NAMESPACE ./test_prometheus_metrics.py test-runner-0:/tmp diff --git a/tests/templates/kuttl/smoke/test_native_metrics.py b/tests/templates/kuttl/smoke/test_prometheus_metrics.py similarity index 66% rename from tests/templates/kuttl/smoke/test_native_metrics.py rename to tests/templates/kuttl/smoke/test_prometheus_metrics.py index c7138ec1..89dd88b0 100755 --- a/tests/templates/kuttl/smoke/test_native_metrics.py +++ b/tests/templates/kuttl/smoke/test_prometheus_metrics.py @@ -38,16 +38,27 @@ def check_namenode_metrics(namespace: str) -> None: def check_datanode_metrics( namespace: str, + datanode_pvc_arg: str ) -> None: expected_metrics: list[str] = [ r'metrics_system_num_active_sources\{context="metricssystem",hostname="hdfs-datanode-default-\d+', - 
r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', - r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', r'datanode_blocks_get_local_path_info\{sessionid="null",context="dfs",hostname="hdfs-datanode-default-\d+"\}', r'datanode_blocks_read\{sessionid="null",context="dfs",hostname="hdfs-datanode-default-\d+"\}', r'jvm_metrics_gc_count\{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-\d+"\}', ] + # metrics change depending on datanode pvcs + if datanode_pvc_arg == "default": + expected_metrics.extend([ + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + ]) + else: + expected_metrics.extend([ + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + ]) + check_metrics(namespace, "datanode", 9864, expected_metrics) @@ -64,6 +75,7 @@ def check_journalnode_metrics( if __name__ == "__main__": namespace_arg: str = sys.argv[1] + datanode_pvc_arg: str = sys.argv[2] logging.basicConfig( level="DEBUG", @@ -72,7 +84,7 @@ def check_journalnode_metrics( ) check_namenode_metrics(namespace_arg) - check_datanode_metrics(namespace_arg) + check_datanode_metrics(namespace_arg, datanode_pvc_arg) check_journalnode_metrics(namespace_arg) print("All expected native metrics found") From f55419aebf64cd9270a07c78ed935ab810f248dd Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 27 Oct 2025 13:30:39 +0100 Subject: [PATCH 17/18] run precommit --- .../kuttl/smoke/test_prometheus_metrics.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/templates/kuttl/smoke/test_prometheus_metrics.py b/tests/templates/kuttl/smoke/test_prometheus_metrics.py index 89dd88b0..da394e84 100755 --- a/tests/templates/kuttl/smoke/test_prometheus_metrics.py +++ b/tests/templates/kuttl/smoke/test_prometheus_metrics.py @@ -36,10 +36,7 @@ def check_namenode_metrics(namespace: str) -> None: check_metrics(namespace, "namenode", 9870, expected_metrics) -def check_datanode_metrics( - namespace: str, - datanode_pvc_arg: str -) -> None: +def check_datanode_metrics(namespace: str, datanode_pvc_arg: str) -> None: expected_metrics: list[str] = [ r'metrics_system_num_active_sources\{context="metricssystem",hostname="hdfs-datanode-default-\d+', 
r'datanode_blocks_get_local_path_info\{sessionid="null",context="dfs",hostname="hdfs-datanode-default-\d+"\}', @@ -47,17 +44,21 @@ def check_datanode_metrics( r'jvm_metrics_gc_count\{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-\d+"\}', ] - # metrics change depending on datanode pvcs + # metrics change depending on datanode pvcs if datanode_pvc_arg == "default": - expected_metrics.extend([ - r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', - r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', - ]) + expected_metrics.extend( + [ + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + ] + ) else: - expected_metrics.extend([ - r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', - r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', - ]) + expected_metrics.extend( + [ + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + ] + ) check_metrics(namespace, "datanode", 9864, expected_metrics) From 59f8249ea747900200e252f995596d3b0d7192ad Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 27 Oct 2025 14:02:09 +0100 Subject: [PATCH 18/18] adapted changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ab031cd..3b7d81fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ All notable changes to this project will be documented in this file. - The built-in Prometheus servlet is now enabled and metrics are exposed under the `/prom` path of all UI services ([#695]). - Add several properties to `hdfs-site.xml` and `core-site.xml` that improve general performance and reliability ([#696]). - Add RBAC rule to helm template for automatic cluster domain detection ([#699]). 
+- Add `prometheus.io/path|port|scheme` annotations to metrics service ([#721]).
 
 ### Changed
 
@@ -48,6 +49,9 @@ All notable changes to this project will be documented in this file.
   - The CLI argument `--kubernetes-node-name` or env variable `KUBERNETES_NODE_NAME` needs to be set. The helm-chart takes care of this.
 - The operator helm-chart now grants RBAC `patch` permissions on `events.k8s.io/events`,
   so events can be aggregated (e.g. "error happened 10 times over the last 5 minutes") ([#700]).
+- BREAKING: Renamed headless rolegroup service from `<name>-<role>-<rolegroup>` to `<name>-<role>-<rolegroup>-metrics` ([#721]).
+  - The `prometheus.io/scrape` label was moved to the metrics service
+  - The headless service now only exposes product / data ports, the metrics service only metrics ports
 
 ### Fixed
 
@@ -76,6 +80,7 @@ All notable changes to this project will be documented in this file.
 [#697]: https://github.com/stackabletech/hdfs-operator/pull/697
 [#699]: https://github.com/stackabletech/hdfs-operator/pull/699
 [#700]: https://github.com/stackabletech/hdfs-operator/pull/700
+[#721]: https://github.com/stackabletech/hdfs-operator/pull/721
 
 ## [25.3.0] - 2025-03-21
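Finally, a short sketch of the resulting naming scheme, using hypothetical helper names; the produced strings match the Services asserted in the smoke tests above:

[source,python]
----
def headless_service_name(name: str, role: str, rolegroup: str) -> str:
    """Serves only the product/data ports after this change."""
    return f"{name}-{role}-{rolegroup}-headless"


def metrics_service_name(name: str, role: str, rolegroup: str) -> str:
    """Serves only the metrics ports and carries the scrape hints."""
    return f"{name}-{role}-{rolegroup}-metrics"


assert metrics_service_name("hdfs", "namenode", "default") == "hdfs-namenode-default-metrics"
assert headless_service_name("hdfs", "journalnode", "default") == "hdfs-journalnode-default-headless"
----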