diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ab031cd..3b7d81fb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ All notable changes to this project will be documented in this file.
 - The built-in Prometheus servlet is now enabled and metrics are exposed under the `/prom` path of all UI services ([#695]).
 - Add several properties to `hdfs-site.xml` and `core-site.xml` that improve general performance and reliability ([#696]).
 - Add RBAC rule to helm template for automatic cluster domain detection ([#699]).
+- Add `prometheus.io/path|port|scheme` annotations to the metrics service ([#721]).
 
 ### Changed
 
@@ -48,6 +49,9 @@ All notable changes to this project will be documented in this file.
 - The CLI argument `--kubernetes-node-name` or env variable `KUBERNETES_NODE_NAME` needs to be set. The helm-chart takes care of this.
 - The operator helm-chart now grants RBAC `patch` permissions on `events.k8s.io/events`, so events can be aggregated (e.g. "error happened 10 times over the last 5 minutes") ([#700]).
+- BREAKING: The rolegroup service `<stacklet>-<role>-<rolegroup>` was split into `<stacklet>-<role>-<rolegroup>-headless` and `<stacklet>-<role>-<rolegroup>-metrics` ([#721]).
+  - The `prometheus.io/scrape` label was moved to the metrics service
+  - The headless service now only exposes product / data ports, the metrics service only metrics ports
 
 ### Fixed
 
@@ -76,6 +80,7 @@ All notable changes to this project will be documented in this file.
 [#697]: https://github.com/stackabletech/hdfs-operator/pull/697
 [#699]: https://github.com/stackabletech/hdfs-operator/pull/699
 [#700]: https://github.com/stackabletech/hdfs-operator/pull/700
+[#721]: https://github.com/stackabletech/hdfs-operator/pull/721
 
 ## [25.3.0] - 2025-03-21
 
diff --git a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
index 53c52956..9c6d5eee 100644
--- a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
+++ b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc
@@ -1,17 +1,25 @@
 = Monitoring
-:description: The HDFS cluster can be monitored with Prometheus from inside or outside the K8S cluster.
+:description: The HDFS cluster is automatically configured to export Prometheus metrics. The cluster can be monitored with Prometheus from inside or outside the K8S cluster.
 
-All services (with the exception of the Zookeeper daemon on the node names) run with the JMX exporter agent enabled and expose metrics on the `metrics` port.
-This port is available from the container level up to the NodePort services.
+The managed HDFS stacklets are automatically configured to export Prometheus metrics.
+See xref:operators:monitoring.adoc[] for more details.
 
 [IMPORTANT]
 ====
-Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are also available at the `/prom` endpoint of all the UI services.
+Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are available at the `/prom` endpoint of all the UI services.
 The JMX exporter metrics are now deprecated and will be removed in a future release.
 ====
 
-The metrics endpoints are also used as liveliness probes by Kubernetes.
+In the case of the NameNode, this endpoint is reachable via the `metrics` service:
+
+[source,shell]
+----
+http://<hdfs-cluster-name>-namenode-<role-group>-metrics:9870/prom
+----
 
-See xref:operators:monitoring.adoc[] for more details.
+== Authentication when using TLS
+
+HDFS exposes its metrics on the same port as its web UI. Hence, when HDFS is configured with TLS, the metrics are also secured by TLS,
+and the clients scraping the metrics endpoint need to authenticate against it.
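+A Prometheus scrape job for such a TLS-secured endpoint could, for illustration, look roughly like the following sketch.
+The job name, the certificate paths and the target (which assumes an HDFS stacklet named `hdfs` with the `default` role group) are placeholders and are not created by this operator:
+
+[source,yaml]
+----
+scrape_configs:
+  - job_name: hdfs-namenode-metrics   # hypothetical job name
+    scheme: https
+    metrics_path: /prom
+    tls_config:
+      # client certificate and CA used to authenticate against the TLS-secured endpoint
+      ca_file: /etc/prometheus/tls/ca.crt
+      cert_file: /etc/prometheus/tls/tls.crt
+      key_file: /etc/prometheus/tls/tls.key
+    static_configs:
+      - targets:
+          - hdfs-namenode-default-metrics:9871
+----
+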
This could for example be accomplished by utilizing mTLS +between Kubernetes Pods with the xref:home:secret-operator:index.adoc[Secret Operator]. diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index 4110b48c..b09be984 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -48,6 +48,7 @@ use stackable_operator::{ CustomContainerLogConfig, }, }, + role_utils::RoleGroupRef, utils::{COMMON_BASH_TRAP_FUNCTIONS, cluster_info::KubernetesClusterInfo}, }; use strum::{Display, EnumDiscriminants, IntoStaticStr}; @@ -216,24 +217,25 @@ impl ContainerConfig { hdfs: &v1alpha1::HdfsCluster, cluster_info: &KubernetesClusterInfo, role: &HdfsNodeRole, - role_group: &str, + rolegroup_ref: &RoleGroupRef, resolved_product_image: &ResolvedProductImage, merged_config: &AnyNodeConfig, env_overrides: Option<&BTreeMap>, zk_config_map_name: &str, - object_name: &str, namenode_podrefs: &[HdfsPodRef], labels: &Labels, ) -> Result<(), Error> { // HDFS main container let main_container_config = Self::from(*role); - pb.add_volumes(main_container_config.volumes(merged_config, object_name, labels)?) + let object_name = rolegroup_ref.object_name(); + + pb.add_volumes(main_container_config.volumes(merged_config, &object_name, labels)?) .context(AddVolumeSnafu)?; pb.add_container(main_container_config.main_container( hdfs, cluster_info, role, - role_group, + rolegroup_ref, resolved_product_image, zk_config_map_name, env_overrides, @@ -277,6 +279,8 @@ impl ContainerConfig { ) .with_pod_scope() .with_node_scope() + // To scrape metrics behind TLS endpoint (without FQDN) + .with_service_scope(rolegroup_ref.rolegroup_metrics_service_name()) .with_format(SecretFormat::TlsPkcs12) .with_tls_pkcs12_password(TLS_STORE_PASSWORD) .with_auto_tls_cert_lifetime( @@ -319,7 +323,7 @@ impl ContainerConfig { let zkfc_container_config = Self::try_from(NameNodeContainer::Zkfc.to_string())?; pb.add_volumes(zkfc_container_config.volumes( merged_config, - object_name, + &object_name, labels, )?) .context(AddVolumeSnafu)?; @@ -327,7 +331,7 @@ impl ContainerConfig { hdfs, cluster_info, role, - role_group, + rolegroup_ref, resolved_product_image, zk_config_map_name, env_overrides, @@ -340,7 +344,7 @@ impl ContainerConfig { Self::try_from(NameNodeContainer::FormatNameNodes.to_string())?; pb.add_volumes(format_namenodes_container_config.volumes( merged_config, - object_name, + &object_name, labels, )?) .context(AddVolumeSnafu)?; @@ -348,7 +352,7 @@ impl ContainerConfig { hdfs, cluster_info, role, - role_group, + &rolegroup_ref.role_group, resolved_product_image, zk_config_map_name, env_overrides, @@ -362,7 +366,7 @@ impl ContainerConfig { Self::try_from(NameNodeContainer::FormatZooKeeper.to_string())?; pb.add_volumes(format_zookeeper_container_config.volumes( merged_config, - object_name, + &object_name, labels, )?) .context(AddVolumeSnafu)?; @@ -370,7 +374,7 @@ impl ContainerConfig { hdfs, cluster_info, role, - role_group, + &rolegroup_ref.role_group, resolved_product_image, zk_config_map_name, env_overrides, @@ -385,7 +389,7 @@ impl ContainerConfig { Self::try_from(DataNodeContainer::WaitForNameNodes.to_string())?; pb.add_volumes(wait_for_namenodes_container_config.volumes( merged_config, - object_name, + &object_name, labels, )?) 
.context(AddVolumeSnafu)?; @@ -393,7 +397,7 @@ impl ContainerConfig { hdfs, cluster_info, role, - role_group, + &rolegroup_ref.role_group, resolved_product_image, zk_config_map_name, env_overrides, @@ -462,7 +466,7 @@ impl ContainerConfig { hdfs: &v1alpha1::HdfsCluster, cluster_info: &KubernetesClusterInfo, role: &HdfsNodeRole, - role_group: &str, + rolegroup_ref: &RoleGroupRef, resolved_product_image: &ResolvedProductImage, zookeeper_config_map_name: &str, env_overrides: Option<&BTreeMap>, @@ -481,7 +485,7 @@ impl ContainerConfig { .args(self.args(hdfs, cluster_info, role, merged_config, &[])?) .add_env_vars(self.env( hdfs, - role_group, + &rolegroup_ref.role_group, zookeeper_config_map_name, env_overrides, resources.as_ref(), @@ -1249,16 +1253,18 @@ wait_for_termination $! /// Container ports for the main containers namenode, datanode and journalnode. fn container_ports(&self, hdfs: &v1alpha1::HdfsCluster) -> Vec { match self { - ContainerConfig::Hdfs { role, .. } => hdfs - .ports(role) - .into_iter() - .map(|(name, value)| ContainerPort { - name: Some(name), - container_port: i32::from(value), - protocol: Some("TCP".to_string()), - ..ContainerPort::default() - }) - .collect(), + ContainerConfig::Hdfs { role, .. } => { + // data ports + hdfs.hdfs_main_container_ports(role) + .into_iter() + .map(|(name, value)| ContainerPort { + name: Some(name), + container_port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ContainerPort::default() + }) + .collect() + } _ => { vec![] } diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs index 876da8f3..44266d0c 100644 --- a/rust/operator-binary/src/crd/constants.rs +++ b/rust/operator-binary/src/crd/constants.rs @@ -20,21 +20,28 @@ pub const SERVICE_PORT_NAME_HTTP: &str = "http"; pub const SERVICE_PORT_NAME_HTTPS: &str = "https"; pub const SERVICE_PORT_NAME_DATA: &str = "data"; pub const SERVICE_PORT_NAME_METRICS: &str = "metrics"; +pub const SERVICE_PORT_NAME_JMX_METRICS: &str = "jmx-metrics"; pub const DEFAULT_LISTENER_CLASS: &str = "cluster-internal"; pub const DEFAULT_NAME_NODE_METRICS_PORT: u16 = 8183; +pub const DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT: u16 = 9870; +pub const DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT: u16 = 9871; pub const DEFAULT_NAME_NODE_HTTP_PORT: u16 = 9870; pub const DEFAULT_NAME_NODE_HTTPS_PORT: u16 = 9871; pub const DEFAULT_NAME_NODE_RPC_PORT: u16 = 8020; pub const DEFAULT_DATA_NODE_METRICS_PORT: u16 = 8082; +pub const DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT: u16 = 9864; +pub const DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT: u16 = 9865; pub const DEFAULT_DATA_NODE_HTTP_PORT: u16 = 9864; pub const DEFAULT_DATA_NODE_HTTPS_PORT: u16 = 9865; pub const DEFAULT_DATA_NODE_DATA_PORT: u16 = 9866; pub const DEFAULT_DATA_NODE_IPC_PORT: u16 = 9867; pub const DEFAULT_JOURNAL_NODE_METRICS_PORT: u16 = 8081; +pub const DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT: u16 = 8480; +pub const DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT: u16 = 8481; pub const DEFAULT_JOURNAL_NODE_HTTP_PORT: u16 = 8480; pub const DEFAULT_JOURNAL_NODE_HTTPS_PORT: u16 = 8481; pub const DEFAULT_JOURNAL_NODE_RPC_PORT: u16 = 8485; diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 598156cd..96cdb1fe 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -54,15 +54,19 @@ use crate::crd::{ APP_NAME, CORE_SITE_XML, DEFAULT_DATA_NODE_DATA_PORT, DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_DATA_NODE_HTTP_PORT, 
DEFAULT_DATA_NODE_HTTPS_PORT, DEFAULT_DATA_NODE_IPC_PORT, DEFAULT_DATA_NODE_METRICS_PORT, + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT, DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT, DEFAULT_DFS_REPLICATION_FACTOR, DEFAULT_JOURNAL_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_JOURNAL_NODE_HTTP_PORT, DEFAULT_JOURNAL_NODE_HTTPS_PORT, - DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_JOURNAL_NODE_RPC_PORT, DEFAULT_LISTENER_CLASS, - DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_NAME_NODE_HTTP_PORT, - DEFAULT_NAME_NODE_HTTPS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, DEFAULT_NAME_NODE_RPC_PORT, - DFS_REPLICATION, HADOOP_POLICY_XML, HDFS_SITE_XML, JVM_SECURITY_PROPERTIES_FILE, - LISTENER_VOLUME_NAME, SERVICE_PORT_NAME_DATA, SERVICE_PORT_NAME_HTTP, - SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_METRICS, - SERVICE_PORT_NAME_RPC, SSL_CLIENT_XML, SSL_SERVER_XML, + DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT, + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT, DEFAULT_JOURNAL_NODE_RPC_PORT, + DEFAULT_LISTENER_CLASS, DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, + DEFAULT_NAME_NODE_HTTP_PORT, DEFAULT_NAME_NODE_HTTPS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT, DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT, + DEFAULT_NAME_NODE_RPC_PORT, DFS_REPLICATION, HADOOP_POLICY_XML, HDFS_SITE_XML, + JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_NAME, SERVICE_PORT_NAME_DATA, + SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, + SERVICE_PORT_NAME_JMX_METRICS, SERVICE_PORT_NAME_METRICS, SERVICE_PORT_NAME_RPC, + SSL_CLIENT_XML, SSL_SERVER_XML, }, security::{AuthenticationConfig, KerberosConfig}, storage::{ @@ -387,10 +391,10 @@ impl v1alpha1::HdfsCluster { let ns = ns.clone(); (0..*replicas).map(move |i| HdfsPodRef { namespace: ns.clone(), - role_group_service_name: rolegroup_ref.object_name(), + role_group_service_name: rolegroup_ref.rolegroup_headless_service_name(), pod_name: format!("{}-{}", rolegroup_ref.object_name(), i), ports: self - .ports(role) + .data_ports(role) .iter() .map(|(n, p)| (n.clone(), *p)) .collect(), @@ -667,14 +671,77 @@ impl v1alpha1::HdfsCluster { .sum() } + pub fn native_metrics_port(&self, role: &HdfsNodeRole) -> u16 { + match role { + HdfsNodeRole::Name => { + if self.has_https_enabled() { + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT + } + } + HdfsNodeRole::Data => { + if self.has_https_enabled() { + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT + } + } + HdfsNodeRole::Journal => { + if self.has_https_enabled() { + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT + } else { + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT + } + } + } + } + + /// Deprecated required JMX metrics port name and metrics port number tuples depending on the role. 
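+    /// The JMX exporter is deprecated (see the CHANGELOG), so these ports are only kept for backwards compatibility until the exporter is removed.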
+ pub fn jmx_metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + match role { + HdfsNodeRole::Name => vec![( + String::from(SERVICE_PORT_NAME_JMX_METRICS), + DEFAULT_NAME_NODE_METRICS_PORT, + )], + HdfsNodeRole::Data => vec![( + String::from(SERVICE_PORT_NAME_JMX_METRICS), + DEFAULT_DATA_NODE_METRICS_PORT, + )], + HdfsNodeRole::Journal => vec![( + String::from(SERVICE_PORT_NAME_JMX_METRICS), + DEFAULT_JOURNAL_NODE_METRICS_PORT, + )], + } + } + + pub fn metrics_service_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut metrics_service_ports = vec![]; + // "native" ports + metrics_service_ports.extend(self.native_metrics_ports(role)); + // deprecated jmx ports + metrics_service_ports.extend(self.jmx_metrics_ports(role)); + metrics_service_ports + } + + pub fn headless_service_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut headless_service_ports = vec![]; + headless_service_ports.extend(self.data_ports(role)); + headless_service_ports + } + + pub fn hdfs_main_container_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + let mut main_container_ports = vec![]; + main_container_ports.extend(self.data_ports(role)); + // TODO: This will be exposed in the listener if added to container ports? + // main_container_ports.extend(self.jmx_metrics_ports(role)); + main_container_ports + } + /// Returns required port name and port number tuples depending on the role. - pub fn ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + fn data_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { match role { HdfsNodeRole::Name => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_NAME_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_RPC), DEFAULT_NAME_NODE_RPC_PORT, @@ -692,10 +759,6 @@ impl v1alpha1::HdfsCluster { }, ], HdfsNodeRole::Data => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_DATA_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_DATA), DEFAULT_DATA_NODE_DATA_PORT, @@ -717,10 +780,6 @@ impl v1alpha1::HdfsCluster { }, ], HdfsNodeRole::Journal => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_JOURNAL_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_RPC), DEFAULT_JOURNAL_NODE_RPC_PORT, @@ -739,6 +798,45 @@ impl v1alpha1::HdfsCluster { ], } } + + /// Returns required native metrics port name and metrics port number tuples depending on the role and security settings. 
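+    /// The built-in metrics are served by the web UI endpoint (`/prom`), so the returned port is the HTTPS or HTTP UI port depending on whether TLS is enabled.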
+ fn native_metrics_ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { + match role { + HdfsNodeRole::Name => vec![if self.has_https_enabled() { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTPS_PORT, + ) + } else { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_NAME_NODE_NATIVE_METRICS_HTTP_PORT, + ) + }], + HdfsNodeRole::Data => vec![if self.has_https_enabled() { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTPS_PORT, + ) + } else { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_DATA_NODE_NATIVE_METRICS_HTTP_PORT, + ) + }], + HdfsNodeRole::Journal => vec![if self.has_https_enabled() { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTPS_PORT, + ) + } else { + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_JOURNAL_NODE_NATIVE_METRICS_HTTP_PORT, + ) + }], + } + } } #[derive(Clone, Debug, Deserialize, Eq, Hash, JsonSchema, PartialEq, Serialize)] diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs index 0eea1e6d..2b7b1e07 100644 --- a/rust/operator-binary/src/hdfs_controller.rs +++ b/rust/operator-binary/src/hdfs_controller.rs @@ -27,7 +27,7 @@ use stackable_operator::{ DeepMerge, api::{ apps::v1::{StatefulSet, StatefulSetSpec}, - core::v1::{ConfigMap, Service, ServiceAccount, ServicePort, ServiceSpec}, + core::v1::{ConfigMap, ServiceAccount}, }, apimachinery::pkg::apis::meta::v1::LabelSelector, }, @@ -37,7 +37,7 @@ use stackable_operator::{ core::{DeserializeGuard, error_boundary}, runtime::{controller::Action, events::Recorder, reflector::ObjectRef}, }, - kvp::{Label, LabelError, Labels}, + kvp::{LabelError, Labels}, logging::controller::ReconcilerError, product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config}, role_utils::{GenericRoleConfig, RoleGroupRef}, @@ -69,6 +69,7 @@ use crate::{ }, product_logging::extend_role_group_config_map, security::{self, kerberos, opa::HdfsOpaConfig}, + service::{self, rolegroup_headless_service, rolegroup_metrics_service}, }; pub const RESOURCE_MANAGER_HDFS_CONTROLLER: &str = "hdfs-operator-hdfs-controller"; @@ -218,15 +219,9 @@ pub enum Error { #[snafu(display("failed to build roleGroup selector labels"))] RoleGroupSelectorLabels { source: crate::crd::Error }, - #[snafu(display("failed to build prometheus label"))] - BuildPrometheusLabel { source: LabelError }, - #[snafu(display("failed to build cluster resources label"))] BuildClusterResourcesLabel { source: LabelError }, - #[snafu(display("failed to build role-group selector label"))] - BuildRoleGroupSelectorLabel { source: LabelError }, - #[snafu(display("failed to build role-group volume claim templates from config"))] BuildRoleGroupVolumeClaimTemplates { source: container::Error }, @@ -250,6 +245,9 @@ pub enum Error { ResolveProductImage { source: product_image_selection::Error, }, + + #[snafu(display("failed to builds service"))] + BuildService { source: service::Error }, } impl ReconcilerError for Error { @@ -392,6 +390,13 @@ pub async fn reconcile_hdfs( let rolegroup_ref = hdfs.rolegroup_ref(role_name, rolegroup_name); + let rg_service = + rolegroup_headless_service(hdfs, &role, &rolegroup_ref, &resolved_product_image) + .context(BuildServiceSnafu)?; + let rg_metrics_service = + rolegroup_metrics_service(hdfs, &role, &rolegroup_ref, &resolved_product_image) + .context(BuildServiceSnafu)?; + // We need to split the creation and the usage of the "metadata" variable in 
two statements. // to avoid the compiler error "E0716 (temporary value dropped while borrowed)". let mut metadata = ObjectMetaBuilder::new(); @@ -411,8 +416,6 @@ pub async fn reconcile_hdfs( )) .context(ObjectMetaSnafu)?; - let rg_service = rolegroup_service(hdfs, metadata, &role, &rolegroup_ref)?; - let rg_configmap = rolegroup_config_map( hdfs, &client.kubernetes_cluster_info, @@ -439,12 +442,20 @@ pub async fn reconcile_hdfs( )?; let rg_service_name = rg_service.name_any(); + let rg_metrics_service_name = rg_metrics_service.name_any(); + cluster_resources .add(client, rg_service) .await .with_context(|_| ApplyRoleGroupServiceSnafu { name: rg_service_name, })?; + cluster_resources + .add(client, rg_metrics_service) + .await + .with_context(|_| ApplyRoleGroupServiceSnafu { + name: rg_metrics_service_name, + })?; let rg_configmap_name = rg_configmap.name_any(); cluster_resources .add(client, rg_configmap.clone()) @@ -560,50 +571,6 @@ pub async fn reconcile_hdfs( Ok(Action::await_change()) } -fn rolegroup_service( - hdfs: &v1alpha1::HdfsCluster, - metadata: &ObjectMetaBuilder, - role: &HdfsNodeRole, - rolegroup_ref: &RoleGroupRef, -) -> HdfsOperatorResult { - tracing::info!("Setting up Service for {:?}", rolegroup_ref); - - let prometheus_label = - Label::try_from(("prometheus.io/scrape", "true")).context(BuildPrometheusLabelSnafu)?; - let mut metadata_with_prometheus_label = metadata.clone(); - metadata_with_prometheus_label.with_label(prometheus_label); - - let service_spec = ServiceSpec { - // Internal communication does not need to be exposed - type_: Some("ClusterIP".to_string()), - cluster_ip: Some("None".to_string()), - ports: Some( - hdfs.ports(role) - .into_iter() - .map(|(name, value)| ServicePort { - name: Some(name), - port: i32::from(value), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }) - .collect(), - ), - selector: Some( - hdfs.rolegroup_selector_labels(rolegroup_ref) - .context(RoleGroupSelectorLabelsSnafu)? - .into(), - ), - publish_not_ready_addresses: Some(true), - ..ServiceSpec::default() - }; - - Ok(Service { - metadata: metadata_with_prometheus_label.build(), - spec: Some(service_spec), - status: None, - }) -} - #[allow(clippy::too_many_arguments)] fn rolegroup_config_map( hdfs: &v1alpha1::HdfsCluster, @@ -857,7 +824,6 @@ fn rolegroup_statefulset( ) -> HdfsOperatorResult { tracing::info!("Setting up StatefulSet for {:?}", rolegroup_ref); - let object_name = rolegroup_ref.object_name(); // PodBuilder for StatefulSet Pod template. 
let mut pb = PodBuilder::new(); @@ -887,12 +853,11 @@ fn rolegroup_statefulset( hdfs, cluster_info, role, - &rolegroup_ref.role_group, + rolegroup_ref, resolved_product_image, merged_config, env_overrides, &hdfs.spec.cluster_config.zookeeper_config_map_name, - &object_name, namenode_podrefs, &rolegroup_selector_labels, ) @@ -922,7 +887,7 @@ fn rolegroup_statefulset( match_labels: Some(rolegroup_selector_labels.into()), ..LabelSelector::default() }, - service_name: Some(object_name), + service_name: Some(rolegroup_ref.rolegroup_headless_service_name()), template: pod_template, volume_claim_templates: Some(pvcs), @@ -1013,6 +978,7 @@ properties: [] .unwrap() .get("default") .unwrap(); + let rolegroup_ref = hdfs.rolegroup_ref(role.to_string(), "default"); let env_overrides = rolegroup_config.get(&PropertyNameKind::Env); let merged_config = role.merged_config(&hdfs, "default").unwrap(); @@ -1031,12 +997,11 @@ properties: [] cluster_domain: DomainName::try_from("cluster.local").unwrap(), }, &role, - "default", + &rolegroup_ref, &resolved_product_image, &merged_config, env_overrides, &hdfs.spec.cluster_config.zookeeper_config_map_name, - "todo", &[], &Labels::new(), ) diff --git a/rust/operator-binary/src/main.rs b/rust/operator-binary/src/main.rs index 4ea60b2b..3701ac3b 100644 --- a/rust/operator-binary/src/main.rs +++ b/rust/operator-binary/src/main.rs @@ -47,6 +47,7 @@ mod hdfs_controller; mod operations; mod product_logging; mod security; +mod service; mod built_info { include!(concat!(env!("OUT_DIR"), "/built.rs")); diff --git a/rust/operator-binary/src/service.rs b/rust/operator-binary/src/service.rs new file mode 100644 index 00000000..bac5cdca --- /dev/null +++ b/rust/operator-binary/src/service.rs @@ -0,0 +1,171 @@ +use snafu::{ResultExt, Snafu}; +use stackable_operator::{ + builder::meta::ObjectMetaBuilder, + commons::product_image_selection::ResolvedProductImage, + k8s_openapi::api::core::v1::{Service, ServicePort, ServiceSpec}, + kube::runtime::reflector::ObjectRef, + kvp::{Annotations, Label, LabelError}, + role_utils::RoleGroupRef, +}; + +use crate::{ + build_recommended_labels, + crd::{HdfsNodeRole, v1alpha1}, + hdfs_controller::RESOURCE_MANAGER_HDFS_CONTROLLER, +}; + +#[derive(Snafu, Debug)] +pub enum Error { + #[snafu(display("failed to build prometheus label"))] + BuildPrometheusLabel { source: LabelError }, + + #[snafu(display("failed to build role-group selector label"))] + BuildRoleGroupSelectorLabel { source: LabelError }, + + #[snafu(display("failed to build object meta data"))] + ObjectMeta { + source: stackable_operator::builder::meta::Error, + }, + + #[snafu(display("no metadata for {obj_ref:?}"))] + ObjectMissingMetadataForOwnerRef { + source: stackable_operator::builder::meta::Error, + obj_ref: ObjectRef, + }, + + #[snafu(display("failed to build roleGroup selector labels"))] + RoleGroupSelectorLabels { source: crate::crd::Error }, +} + +pub(crate) fn rolegroup_headless_service( + hdfs: &v1alpha1::HdfsCluster, + role: &HdfsNodeRole, + rolegroup_ref: &RoleGroupRef, + resolved_product_image: &ResolvedProductImage, +) -> Result { + tracing::info!("Setting up Service for {:?}", rolegroup_ref); + + let mut metadata_builder = ObjectMetaBuilder::new(); + metadata_builder + .name_and_namespace(hdfs) + .name(rolegroup_ref.rolegroup_headless_service_name()) + .ownerreference_from_resource(hdfs, None, Some(true)) + .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { + obj_ref: ObjectRef::from_obj(hdfs), + })? 
+ .with_recommended_labels(build_recommended_labels( + hdfs, + RESOURCE_MANAGER_HDFS_CONTROLLER, + &resolved_product_image.app_version_label_value, + &rolegroup_ref.role, + &rolegroup_ref.role_group, + )) + .context(ObjectMetaSnafu)?; + + let service_spec = ServiceSpec { + // Internal communication does not need to be exposed + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some( + hdfs.headless_service_ports(role) + .into_iter() + .map(|(name, value)| ServicePort { + name: Some(name), + port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }) + .collect(), + ), + selector: Some( + hdfs.rolegroup_selector_labels(rolegroup_ref) + .context(RoleGroupSelectorLabelsSnafu)? + .into(), + ), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata: metadata_builder.build(), + spec: Some(service_spec), + status: None, + }) +} + +pub(crate) fn rolegroup_metrics_service( + hdfs: &v1alpha1::HdfsCluster, + role: &HdfsNodeRole, + rolegroup_ref: &RoleGroupRef, + resolved_product_image: &ResolvedProductImage, +) -> Result { + tracing::info!("Setting up metrics Service for {:?}", rolegroup_ref); + + let service_spec = ServiceSpec { + // Internal communication does not need to be exposed + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some( + hdfs.metrics_service_ports(role) + .into_iter() + .map(|(name, value)| ServicePort { + name: Some(name), + port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }) + .collect(), + ), + selector: Some( + hdfs.rolegroup_selector_labels(rolegroup_ref) + .context(RoleGroupSelectorLabelsSnafu)? + .into(), + ), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata: ObjectMetaBuilder::new() + .name_and_namespace(hdfs) + .name(rolegroup_ref.rolegroup_metrics_service_name()) + .ownerreference_from_resource(hdfs, None, Some(true)) + .with_context(|_| ObjectMissingMetadataForOwnerRefSnafu { + obj_ref: ObjectRef::from_obj(hdfs), + })? + .with_recommended_labels(build_recommended_labels( + hdfs, + RESOURCE_MANAGER_HDFS_CONTROLLER, + &resolved_product_image.app_version_label_value, + &rolegroup_ref.role, + &rolegroup_ref.role_group, + )) + .context(ObjectMetaSnafu)? + .with_label( + Label::try_from(("prometheus.io/scrape", "true")) + .context(BuildPrometheusLabelSnafu)?, + ) + .with_annotations( + Annotations::try_from([ + ("prometheus.io/path".to_owned(), "/prom".to_owned()), + ( + "prometheus.io/port".to_owned(), + hdfs.native_metrics_port(role).to_string(), + ), + ( + "prometheus.io/scheme".to_owned(), + if hdfs.has_https_enabled() { + "https".to_owned() + } else { + "http".to_owned() + }, + ), + ("prometheus.io/scrape".to_owned(), "true".to_owned()), + ]) + .expect("should be valid annotations"), + ) + .build(), + spec: Some(service_spec), + status: None, + }) +} diff --git a/tests/templates/kuttl/logging/test_log_aggregation.py b/tests/templates/kuttl/logging/test_log_aggregation.py index 561eff2c..845b3814 100755 --- a/tests/templates/kuttl/logging/test_log_aggregation.py +++ b/tests/templates/kuttl/logging/test_log_aggregation.py @@ -23,9 +23,9 @@ def check_sent_events(): }, ) - assert ( - response.status_code == 200 - ), "Cannot access the API of the vector aggregator." + assert response.status_code == 200, ( + "Cannot access the API of the vector aggregator." 
+ ) result = response.json() @@ -35,13 +35,13 @@ def check_sent_events(): componentId = transform["componentId"] if componentId == "filteredInvalidEvents": - assert ( - sentEvents is None or sentEvents["sentEventsTotal"] == 0 - ), "Invalid log events were sent." + assert sentEvents is None or sentEvents["sentEventsTotal"] == 0, ( + "Invalid log events were sent." + ) else: - assert ( - sentEvents is not None and sentEvents["sentEventsTotal"] > 0 - ), f'No events were sent in "{componentId}".' + assert sentEvents is not None and sentEvents["sentEventsTotal"] > 0, ( + f'No events were sent in "{componentId}".' + ) if __name__ == "__main__": diff --git a/tests/templates/kuttl/profiling/run-profiler.py b/tests/templates/kuttl/profiling/run-profiler.py index 56727c63..ad46a3de 100644 --- a/tests/templates/kuttl/profiling/run-profiler.py +++ b/tests/templates/kuttl/profiling/run-profiler.py @@ -8,8 +8,7 @@ def start_profiling_and_get_refresh_header(service_url): prof_page = requests.get( - f"{service_url}/prof" - f"?event={EVENT_TYPE}&duration={PROFILING_DURATION_IN_SEC}" + f"{service_url}/prof?event={EVENT_TYPE}&duration={PROFILING_DURATION_IN_SEC}" ) assert prof_page.ok, f"""Profiling could not be started. @@ -56,7 +55,7 @@ def fetch_flamegraph(service_url, refresh_path): def test_profiling(role, port): service_url = ( - f"http://test-hdfs-{role}-default-0.test-hdfs-{role}-default" f":{port}" + f"http://test-hdfs-{role}-default-0.test-hdfs-{role}-default-headless:{port}" ) print(f"Test profiling on {service_url}") diff --git a/tests/templates/kuttl/smoke/20-install-zk.yaml.j2 b/tests/templates/kuttl/smoke/20-install-zk.yaml.j2 index 23fd5e7b..9785e363 100644 --- a/tests/templates/kuttl/smoke/20-install-zk.yaml.j2 +++ b/tests/templates/kuttl/smoke/20-install-zk.yaml.j2 @@ -7,9 +7,8 @@ spec: image: productVersion: "{{ test_scenario['values']['zookeeper'] }}" pullPolicy: IfNotPresent - clusterConfig: - listenerClass: {{ test_scenario['values']['listener-class'] }} {% if lookup('env', 'VECTOR_AGGREGATOR') %} + clusterConfig: vectorAggregatorConfigMapName: vector-aggregator-discovery {% endif %} servers: diff --git a/tests/templates/kuttl/smoke/30-assert.yaml.j2 b/tests/templates/kuttl/smoke/30-assert.yaml.j2 index 0b8dffa2..e685aa05 100644 --- a/tests/templates/kuttl/smoke/30-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/30-assert.yaml.j2 @@ -78,6 +78,100 @@ status: {% if test_scenario['values']['datanode-pvcs'] == '2hdd-1ssd' %} --- apiVersion: v1 +kind: Service +metadata: + name: hdfs-namenode-default-headless +spec: + ports: + - name: rpc + port: 8020 + protocol: TCP + targetPort: 8020 + - name: http + port: 9870 + protocol: TCP + targetPort: 9870 +--- +apiVersion: v1 +kind: Service +metadata: + name: hdfs-namenode-default-metrics +spec: + ports: + - name: metrics + port: 9870 + protocol: TCP + targetPort: 9870 + - name: jmx-metrics + port: 8183 + protocol: TCP + targetPort: 8183 +--- +apiVersion: v1 +kind: Service +metadata: + name: hdfs-datanode-default-headless +spec: + ports: + - name: data + port: 9866 + protocol: TCP + targetPort: 9866 + - name: ipc + port: 9867 + protocol: TCP + targetPort: 9867 + - name: http + port: 9864 + protocol: TCP + targetPort: 9864 +--- +apiVersion: v1 +kind: Service +metadata: + name: hdfs-datanode-default-metrics +spec: + ports: + - name: metrics + port: 9864 + protocol: TCP + targetPort: 9864 + - name: jmx-metrics + port: 8082 + protocol: TCP + targetPort: 8082 +--- +apiVersion: v1 +kind: Service +metadata: + name: hdfs-journalnode-default-headless +spec: 
+ ports: + - name: rpc + port: 8485 + protocol: TCP + targetPort: 8485 + - name: http + port: 8480 + protocol: TCP + targetPort: 8480 +--- +apiVersion: v1 +kind: Service +metadata: + name: hdfs-journalnode-default-metrics +spec: + ports: + - name: metrics + port: 8480 + protocol: TCP + targetPort: 8480 + - name: jmx-metrics + port: 8081 + protocol: TCP + targetPort: 8081 +--- +apiVersion: v1 kind: PersistentVolumeClaim metadata: name: hdd-hdfs-datanode-default-0 diff --git a/tests/templates/kuttl/smoke/51-assert.yaml.j2 b/tests/templates/kuttl/smoke/51-assert.yaml.j2 index 6f57dda2..dcdeb801 100644 --- a/tests/templates/kuttl/smoke/51-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/51-assert.yaml.j2 @@ -8,9 +8,8 @@ commands: {% else %} PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }} {% endif %} - # Test JMX exported metrics + # Test exported metrics kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ - python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION - # Test Prometheus metrics + python /tmp/test_jmx_metrics.py $NAMESPACE $PRODUCT_VERSION kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ - python /tmp/test_prometheus_metrics.py $NAMESPACE $PRODUCT_VERSION + python /tmp/test_prometheus_metrics.py $NAMESPACE {{ test_scenario['values']['datanode-pvcs'] }} diff --git a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml index bb617f97..caefdc8d 100644 --- a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml +++ b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml @@ -2,5 +2,5 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - - script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp + - script: kubectl cp -n $NAMESPACE ./test_jmx_metrics.py test-runner-0:/tmp - script: kubectl cp -n $NAMESPACE ./test_prometheus_metrics.py test-runner-0:/tmp diff --git a/tests/templates/kuttl/smoke/test_metrics.py b/tests/templates/kuttl/smoke/test_jmx_metrics.py similarity index 91% rename from tests/templates/kuttl/smoke/test_metrics.py rename to tests/templates/kuttl/smoke/test_jmx_metrics.py index 066c3e11..f88437c4 100755 --- a/tests/templates/kuttl/smoke/test_metrics.py +++ b/tests/templates/kuttl/smoke/test_jmx_metrics.py @@ -11,7 +11,7 @@ def check_metrics( namespace: str, role: str, port: int, expected_metrics: list[str] ) -> None: response: requests.Response = requests.get( - f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/metrics", + f"http://hdfs-{role}-default-metrics.{namespace}.svc.cluster.local:{port}/metrics", timeout=10, ) assert response.ok, "Requesting metrics failed" @@ -65,9 +65,9 @@ def check_datanode_metrics( # Kind "FSDatasetState" 'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}', # Kind "DataNodeActivity" suffixed with "_info" - 'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', + 'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-\\d+\\.hdfs-datanode-default-headless\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', # Kind "DataNodeActivity" - 'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', + 
'hadoop_datanode_blocks_read{host="hdfs-datanode-default-\\d+\\.hdfs-datanode-default-headless\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', # Counter suffixed with "_total" 'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}', # Boolean metric @@ -126,4 +126,4 @@ def check_journalnode_metrics( check_datanode_metrics(namespace_arg, product_version_arg) check_journalnode_metrics(namespace_arg, product_version_arg) - print("All expected metrics found") + print("All expected JMX metrics found") diff --git a/tests/templates/kuttl/smoke/test_prometheus_metrics.py b/tests/templates/kuttl/smoke/test_prometheus_metrics.py old mode 100644 new mode 100755 index fb19d908..da394e84 --- a/tests/templates/kuttl/smoke/test_prometheus_metrics.py +++ b/tests/templates/kuttl/smoke/test_prometheus_metrics.py @@ -1,7 +1,10 @@ -# Fetch metrics from the built-in Prometheus endpoint of HDFS components. +# Native Prometheus metrics test +# We use a raw string for "expected_metrics" but still have to escape special regex characters "[", "]", "{" and "}" +# that we expect to be in the metrics string. -import logging +import re import sys +import logging import requests @@ -9,84 +12,63 @@ def check_metrics( namespace: str, role: str, port: int, expected_metrics: list[str] ) -> None: - response: requests.Response = requests.get( - f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/prom", + response = requests.get( + f"http://hdfs-{role}-default-metrics.{namespace}.svc.cluster.local:{port}/prom", timeout=10, ) - assert response.ok, "Requesting metrics failed" + assert response.ok, "Requesting metrics failed for {role}." - # Split the response into lines to check for metric names at the beginning of each line. - # This is a bit slower than using a regex but it allows to use special characters like "{}" in metric names - # without needing to escape them. - response_lines = response.text.splitlines() for metric in expected_metrics: - # Use any() with a generator to stop early if the metric is found. - assert any((line.startswith(metric) for line in response_lines)) is True, ( + regex = re.compile(metric, re.MULTILINE) + assert regex.search(response.text) is not None, ( f"Metric '{metric}' not found for {role}" ) -def check_namenode_metrics( - namespace: str, - product_version: str, -) -> None: - expected_metrics: list[str] = [ - # Kind "MetricsSystem" - 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}', - # Counter suffixed with "_total" - # The metric attributes can change so we remove them from the expected metric. 
- # The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}', - "fs_namesystem_files_total", - # Metric suffixed with "_created" - 'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', - # Boolean metric - # 'hadoop_namenode_security_enabled{kind="NameNodeStatus",role="NameNode",service="HDFS"}', - # Non-special metric - 'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', +def check_namenode_metrics(namespace: str) -> None: + expected_metrics = [ + r'metrics_system_num_active_sources\{context="metricssystem",hostname="hdfs-namenode-default-\d+"\}', + r'namenode_total_file_ops\{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-\d+"\}', + r'namenode_files_created\{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-\d+"\}', + r'namenode_files_deleted\{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-\d+"\}', ] check_metrics(namespace, "namenode", 9870, expected_metrics) -def check_datanode_metrics( - namespace: str, - product_version: str, -) -> None: +def check_datanode_metrics(namespace: str, datanode_pvc_arg: str) -> None: expected_metrics: list[str] = [ - # Kind "MetricsSystem" - 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}', - # Kind "FSDatasetState" suffixed with "_total" - # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total", - # Kind "FSDatasetState" - # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity", - # Kind "DataNodeActivity" suffixed with "_info" - 'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', - # Kind "DataNodeActivity" - 'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', - # Counter suffixed with "_total" - # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total", - # Boolean metric - #'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}', - # Non-special metric - 'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}', + r'metrics_system_num_active_sources\{context="metricssystem",hostname="hdfs-datanode-default-\d+', + 
r'datanode_blocks_get_local_path_info\{sessionid="null",context="dfs",hostname="hdfs-datanode-default-\d+"\}', + r'datanode_blocks_read\{sessionid="null",context="dfs",hostname="hdfs-datanode-default-\d+"\}', + r'jvm_metrics_gc_count\{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-\d+"\}', ] + # metrics change depending on datanode pvcs + if datanode_pvc_arg == "default": + expected_metrics.extend( + [ + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/data/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + ] + ) + else: + expected_metrics.extend( + [ + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + r'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total\{context="FSDatasetState",storageinfo="FSDataset\{dirpath=\'\[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'\}",hostname="hdfs-datanode-default-\d+"\}', + ] + ) + check_metrics(namespace, "datanode", 9864, expected_metrics) def check_journalnode_metrics( namespace: str, - product_version: str, ) -> None: expected_metrics: list[str] = [ - # Kind "MetricsSystem" - 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}', - # Non-special metric - 'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}', - # There is no boolean metric in JournalNode. 
+ r'metrics_system_num_active_sources\{context="metricssystem",hostname="hdfs-journalnode-default-\d+"\}', + r'journal_node_bytes_written\{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-\d+"\}', ] check_metrics(namespace, "journalnode", 8480, expected_metrics) @@ -94,7 +76,7 @@ def check_journalnode_metrics( if __name__ == "__main__": namespace_arg: str = sys.argv[1] - product_version_arg: str = sys.argv[2] + datanode_pvc_arg: str = sys.argv[2] logging.basicConfig( level="DEBUG", @@ -102,8 +84,8 @@ def check_journalnode_metrics( stream=sys.stdout, ) - check_namenode_metrics(namespace_arg, product_version_arg) - check_datanode_metrics(namespace_arg, product_version_arg) - check_journalnode_metrics(namespace_arg, product_version_arg) + check_namenode_metrics(namespace_arg) + check_datanode_metrics(namespace_arg, datanode_pvc_arg) + check_journalnode_metrics(namespace_arg) - print("All expected metrics found") + print("All expected native metrics found") diff --git a/tests/templates/kuttl/smoke/webhdfs.py b/tests/templates/kuttl/smoke/webhdfs.py index d7bb4c3f..b0ccb40c 100755 --- a/tests/templates/kuttl/smoke/webhdfs.py +++ b/tests/templates/kuttl/smoke/webhdfs.py @@ -17,7 +17,7 @@ def main() -> int: if command == "ls": http_code = requests.get( - f"http://hdfs-namenode-default-0.hdfs-namenode-default.{namespace}.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=LISTSTATUS" + f"http://hdfs-namenode-default-0.hdfs-namenode-default-headless.{namespace}.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=LISTSTATUS" ).status_code if http_code != 200: result = 1 @@ -31,7 +31,7 @@ def main() -> int: ) } http_code = requests.put( - f"http://hdfs-namenode-default-0.hdfs-namenode-default.{namespace}.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=CREATE", + f"http://hdfs-namenode-default-0.hdfs-namenode-default-headless.{namespace}.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=CREATE", files=files, allow_redirects=True, ).status_code