Commit fd9a7e9

sbernauer authored and maltesander committed

feat!: Add dedicated -metrics service (#748)

* feat!: Add dedicated -metrics service
* changelog
* improve doc comment
* linter
* linter
* Add container port
* fix port problem
* fix tests and role service name
* fix linter
* Update rust/operator-binary/src/controller.rs
  Co-authored-by: Malte Sander <contact@maltesander.com>
* changelog
* changelog
* changelog
* Update CHANGELOG.md
  Co-authored-by: Malte Sander <contact@maltesander.com>
* fix tests
* ruff ruff

Co-authored-by: Malte Sander <malte.sander.it@gmail.com>
Co-authored-by: Malte Sander <contact@maltesander.com>
1 parent: 1fff874 · commit: fd9a7e9

File tree: 15 files changed, +200 −85 lines

CHANGELOG.md

Lines changed: 13 additions & 0 deletions

@@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Added
+
+- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#748]).
+- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#748]).
+
+### Changed
+
+- BREAKING: The per-rolegroup services now only serve the HTTP port and have a `-headless` suffix to better indicate their
+  purpose and to be consistent with other operators ([#748]).
+- BREAKING: The per-role server service is now suffixed with `-server` to be consistent with other operators ([#748]).
+
+[#748]: https://github.com/stackabletech/opa-operator/pull/748
+
 ## [25.7.0] - 2025-07-23
 
 ## [25.7.0-rc1] - 2025-07-18
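
To make the renaming concrete: for a hypothetical OpaCluster named `simple-opa` with a single `default` role group, the operator would now create Services roughly like the following sketch (names are derived from the operator-rs `RoleGroupRef` helpers used in the controller diff below and are illustrative, not generated output):

# Sketch only - assumes cluster "simple-opa", role "server", role group "default"
apiVersion: v1
kind: Service
metadata:
  name: simple-opa-server                    # per-role Service, now with a -server suffix
---
apiVersion: v1
kind: Service
metadata:
  name: simple-opa-server-default-headless   # per-rolegroup Service, HTTP port only
---
apiVersion: v1
kind: Service
metadata:
  name: simple-opa-server-default-metrics    # new per-rolegroup metrics Service
  labels:
    prometheus.io/scrape: "true"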

rust/operator-binary/src/controller.rs

Lines changed: 126 additions & 46 deletions

@@ -50,7 +50,7 @@ use stackable_operator::{
         core::{DeserializeGuard, error_boundary},
         runtime::{controller::Action, reflector::ObjectRef},
     },
-    kvp::{Label, LabelError, Labels, ObjectLabels},
+    kvp::{LabelError, Labels, ObjectLabels},
     logging::controller::ReconcilerError,
     memory::{BinaryMultiple, MemoryQuantity},
     product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config},
@@ -93,6 +93,7 @@ pub const BUNDLES_ACTIVE_DIR: &str = "/bundles/active";
 pub const BUNDLES_INCOMING_DIR: &str = "/bundles/incoming";
 pub const BUNDLES_TMP_DIR: &str = "/bundles/tmp";
 pub const BUNDLE_BUILDER_PORT: i32 = 3030;
+pub const OPA_STACKABLE_SERVICE_NAME: &str = "stackable";
 
 const CONFIG_VOLUME_NAME: &str = "config";
 const CONFIG_DIR: &str = "/stackable/config";
@@ -189,6 +190,12 @@ pub enum Error {
         rolegroup: RoleGroupRef<v1alpha1::OpaCluster>,
     },
 
+    #[snafu(display("failed to apply metrics Service for [{rolegroup}]"))]
+    ApplyRoleGroupMetricsService {
+        source: stackable_operator::cluster_resources::Error,
+        rolegroup: RoleGroupRef<v1alpha1::OpaCluster>,
+    },
+
     #[snafu(display("failed to build ConfigMap for [{rolegroup}]"))]
     BuildRoleGroupConfig {
         source: stackable_operator::builder::configmap::Error,
@@ -346,19 +353,20 @@ pub struct OpaClusterConfigFile {
     bundles: OpaClusterBundle,
     #[serde(skip_serializing_if = "Option::is_none")]
     decision_logs: Option<OpaClusterConfigDecisionLog>,
+    status: Option<OpaClusterConfigStatus>,
 }
 
 impl OpaClusterConfigFile {
     pub fn new(decision_logging: Option<OpaClusterConfigDecisionLog>) -> Self {
         Self {
             services: vec![OpaClusterConfigService {
-                name: String::from("stackable"),
-                url: String::from("http://localhost:3030/opa/v1"),
+                name: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                url: "http://localhost:3030/opa/v1".to_owned(),
             }],
             bundles: OpaClusterBundle {
                 stackable: OpaClusterBundleConfig {
-                    service: String::from("stackable"),
-                    resource: String::from("opa/bundle.tar.gz"),
+                    service: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                    resource: "opa/bundle.tar.gz".to_owned(),
                     persist: true,
                     polling: OpaClusterBundleConfigPolling {
                         min_delay_seconds: 10,
@@ -367,6 +375,12 @@ impl OpaClusterConfigFile {
                 },
             },
             decision_logs: decision_logging,
+            // Enable more Prometheus metrics, such as bundle loads.
+            // See https://www.openpolicyagent.org/docs/monitoring#status-metrics
+            status: Some(OpaClusterConfigStatus {
+                service: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                prometheus: true,
+            }),
         }
     }
 }
@@ -401,6 +415,12 @@ pub struct OpaClusterConfigDecisionLog {
     console: bool,
 }
 
+#[derive(Serialize, Deserialize)]
+struct OpaClusterConfigStatus {
+    service: String,
+    prometheus: bool,
+}
+
 pub async fn reconcile_opa(
     opa: Arc<DeserializeGuard<v1alpha1::OpaCluster>>,
     ctx: Arc<Ctx>,
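
For reference, the `status` block that `OpaClusterConfigFile` now serializes into OPA's config.yaml should come out roughly as follows (a sketch derived from the struct fields above; surrounding keys omitted):

# Sketch of the added config.yaml fragment; "stackable" is OPA_STACKABLE_SERVICE_NAME
status:
  service: stackable
  prometheus: true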
@@ -498,7 +518,10 @@ pub async fn reconcile_opa(
             &rolegroup,
             &merged_config,
         )?;
-        let rg_service = build_rolegroup_service(opa, &resolved_product_image, &rolegroup)?;
+        let rg_service =
+            build_rolegroup_headless_service(opa, &resolved_product_image, &rolegroup)?;
+        let rg_metrics_service =
+            build_rolegroup_metrics_service(opa, &resolved_product_image, &rolegroup)?;
         let rg_daemonset = build_server_rolegroup_daemonset(
             opa,
             &resolved_product_image,
@@ -524,6 +547,12 @@
             .with_context(|_| ApplyRoleGroupServiceSnafu {
                 rolegroup: rolegroup.clone(),
             })?;
+        cluster_resources
+            .add(client, rg_metrics_service)
+            .await
+            .with_context(|_| ApplyRoleGroupMetricsServiceSnafu {
+                rolegroup: rolegroup.clone(),
+            })?;
         ds_cond_builder.add(
             cluster_resources
                 .add(client, rg_daemonset.clone())
@@ -647,17 +676,14 @@ pub fn build_server_role_service(
 /// The rolegroup [`Service`] is a headless service that allows direct access to the instances of a certain rolegroup
 ///
 /// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing.
-fn build_rolegroup_service(
+fn build_rolegroup_headless_service(
     opa: &v1alpha1::OpaCluster,
     resolved_product_image: &ResolvedProductImage,
     rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
 ) -> Result<Service> {
-    let prometheus_label =
-        Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?;
-
     let metadata = ObjectMetaBuilder::new()
         .name_and_namespace(opa)
-        .name(rolegroup.object_name())
+        .name(rolegroup.rolegroup_headless_service_name())
         .ownerreference_from_resource(opa, None, Some(true))
         .context(ObjectMissingMetadataForOwnerRefSnafu)?
         .with_recommended_labels(build_recommended_labels(
@@ -667,19 +693,20 @@
             &rolegroup.role_group,
         ))
         .context(ObjectMetaSnafu)?
-        .with_label(prometheus_label)
         .build();
 
-    let service_selector_labels =
-        Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group)
-            .context(BuildLabelSnafu)?;
-
     let service_spec = ServiceSpec {
-        // Internal communication does not need to be exposed
+        // Currently we don't offer listener exposure of OPA, mostly due to security concerns:
+        // OPA is currently reachable from anywhere within Kubernetes (without authentication),
+        // and opening it up to the outside of Kubernetes might worsen things.
+        // We are open to implementing listener integration, but this needs to be thought through
+        // before implementing it.
+        // Note: We have similar situations for HMS and ZooKeeper, as the authentication options
+        // there are either non-existent (mTLS still opens the plain port) or poor (Kerberos).
         type_: Some("ClusterIP".to_string()),
         cluster_ip: Some("None".to_string()),
-        ports: Some(service_ports(opa.spec.cluster_config.tls.is_some())),
-        selector: Some(service_selector_labels.into()),
+        ports: Some(data_service_ports_with_tls(opa.spec.cluster_config.tls.is_some())),
+        selector: Some(role_group_selector_labels(opa, rolegroup)?.into()),
         publish_not_ready_addresses: Some(true),
         ..ServiceSpec::default()
     };
@@ -691,6 +718,55 @@
     })
 }
 
+/// The rolegroup metrics [`Service`] exposes the Prometheus metrics endpoint and carries the
+/// `prometheus.io/scrape` label.
+fn build_rolegroup_metrics_service(
+    opa: &v1alpha1::OpaCluster,
+    resolved_product_image: &ResolvedProductImage,
+    rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
+) -> Result<Service> {
+    let labels = Labels::try_from([("prometheus.io/scrape", "true")])
+        .expect("static Prometheus labels must be valid");
+
+    let metadata = ObjectMetaBuilder::new()
+        .name_and_namespace(opa)
+        .name(rolegroup.rolegroup_metrics_service_name())
+        .ownerreference_from_resource(opa, None, Some(true))
+        .context(ObjectMissingMetadataForOwnerRefSnafu)?
+        .with_recommended_labels(build_recommended_labels(
+            opa,
+            &resolved_product_image.app_version_label,
+            &rolegroup.role,
+            &rolegroup.role_group,
+        ))
+        .context(ObjectMetaSnafu)?
+        .with_labels(labels)
+        .build();
+
+    let service_spec = ServiceSpec {
+        type_: Some("ClusterIP".to_string()),
+        cluster_ip: Some("None".to_string()),
+        ports: Some(vec![metrics_service_port_with_tls(opa.spec.cluster_config.tls.is_some())]),
+        selector: Some(role_group_selector_labels(opa, rolegroup)?.into()),
+        ..ServiceSpec::default()
+    };
+
+    Ok(Service {
+        metadata,
+        spec: Some(service_spec),
+        status: None,
+    })
+}
+
+/// Returns the [`Labels`] that can be used to select all Pods that are part of the roleGroup.
+fn role_group_selector_labels(
+    opa: &v1alpha1::OpaCluster,
+    rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
+) -> Result<Labels> {
+    Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group)
+        .context(BuildLabelSnafu)
+}
+
 /// The rolegroup [`ConfigMap`] configures the rolegroup based on the configuration given by the administrator
 fn build_server_rolegroup_config_map(
     opa: &v1alpha1::OpaCluster,
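
Put together, `build_rolegroup_metrics_service` should emit a Service along these lines. This is a hedged sketch for a non-TLS cluster named `simple-opa`: the 8081 port number is taken from the kuttl tests further down, the `metrics` port name assumes that is the value of `METRICS_PORT_NAME`, and the selector is abbreviated:

apiVersion: v1
kind: Service
metadata:
  name: simple-opa-server-default-metrics
  labels:
    prometheus.io/scrape: "true"
    # ...plus the recommended Stackable labels
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - name: metrics
      port: 8081        # same port as the HTTP traffic, see metrics_service_port_with_tls below
      protocol: TCP
  selector:
    app.kubernetes.io/name: opa
    app.kubernetes.io/instance: simple-opa
    # ...plus the role/role-group selector labels from role_group_selector_labels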
@@ -923,6 +999,11 @@ fn build_server_rolegroup_daemonset(
     );
 
     // Add appropriate container port based on TLS configuration
+    // If we also added a container port "metrics" pointing to the same port number, we would get
+    //
+    //   .spec.template.spec.containers[name="opa"].ports: duplicate entries for key [containerPort=8081,protocol="TCP"]
+    //
+    // so we don't add one.
     if opa_tls_config.is_some() {
         cb_opa.add_container_port(APP_TLS_PORT_NAME, APP_TLS_PORT.into());
         cb_opa
@@ -1455,36 +1536,35 @@ fn build_prepare_start_command(
     prepare_container_args
 }
 
-fn service_ports(tls_enabled: bool) -> Vec<ServicePort> {
-    let (port_name, port, target_port) = if tls_enabled {
-        (
-            APP_TLS_PORT_NAME,
-            APP_TLS_PORT,
-            IntOrString::String(APP_TLS_PORT_NAME.to_string()),
-        )
+fn data_service_ports_with_tls(tls_enabled: bool) -> Vec<ServicePort> {
+    let (port_name, port) = if tls_enabled {
+        (APP_TLS_PORT_NAME, APP_TLS_PORT)
     } else {
-        (
-            APP_PORT_NAME,
-            APP_PORT,
-            IntOrString::String(APP_PORT_NAME.to_string()),
-        )
+        (APP_PORT_NAME, APP_PORT)
     };
 
-    vec![
-        ServicePort {
-            name: Some(port_name.to_string()),
-            port: port.into(),
-            protocol: Some("TCP".to_string()),
-            ..ServicePort::default()
-        },
-        ServicePort {
-            name: Some(METRICS_PORT_NAME.to_string()),
-            port: 9504, // Arbitrary port number, this is never actually used anywhere
-            protocol: Some("TCP".to_string()),
-            target_port: Some(target_port),
-            ..ServicePort::default()
-        },
-    ]
+    vec![ServicePort {
+        name: Some(port_name.to_string()),
+        port: port.into(),
+        protocol: Some("TCP".to_string()),
+        ..ServicePort::default()
+    }]
+}
+
+fn metrics_service_port_with_tls(tls_enabled: bool) -> ServicePort {
+    let port = if tls_enabled {
+        APP_TLS_PORT
+    } else {
+        APP_PORT
+    };
+
+    ServicePort {
+        name: Some(METRICS_PORT_NAME.to_string()),
+        // The metrics are served on the same port as the HTTP traffic
+        port: port.into(),
+        protocol: Some("TCP".to_string()),
+        ..ServicePort::default()
+    }
 }
 
 /// Creates recommended `ObjectLabels` to be used in deployed resources

rust/operator-binary/src/crd/mod.rs

Lines changed: 6 additions & 2 deletions

@@ -17,7 +17,7 @@ use stackable_operator::{
         merge::Merge,
     },
     k8s_openapi::apimachinery::pkg::api::resource::Quantity,
-    kube::CustomResource,
+    kube::{CustomResource, ResourceExt},
     product_config_utils::Configuration,
     product_logging::{self, spec::Logging},
     role_utils::{
@@ -338,7 +338,11 @@ impl v1alpha1::OpaCluster {
 
     /// The name of the role-level load-balanced Kubernetes `Service`
    pub fn server_role_service_name(&self) -> Option<String> {
-        self.metadata.name.clone()
+        Some(format!(
+            "{cluster_name}-{role}",
+            cluster_name = self.name_any(),
+            role = v1alpha1::OpaRole::Server
+        ))
     }
 
     /// The fully-qualified domain name of the role-level load-balanced Kubernetes `Service`
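
For an OpaCluster named `test-opa` (as used by the kuttl tests below), the role-level Service name therefore changes from the bare cluster name to a `-server` suffixed one; sketched:

# Sketch: role-level Service name before/after this change for cluster "test-opa"
apiVersion: v1
kind: Service
metadata:
  name: test-opa-server   # previously: test-opa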

tests/templates/kuttl/aas-user-info/30-assert.yaml

Lines changed: 1 addition & 1 deletion

@@ -4,4 +4,4 @@ kind: TestAssert
 metadata:
   name: test-regorule
 commands:
-  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test'
+  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test'

tests/templates/kuttl/ad-user-info/30-assert.yaml

Lines changed: 1 addition & 1 deletion

@@ -4,4 +4,4 @@ kind: TestAssert
 metadata:
   name: test-regorule
 commands:
-  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test'
+  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test'

tests/templates/kuttl/keycloak-user-info/30-assert.yaml

Lines changed: 1 addition & 1 deletion

@@ -4,4 +4,4 @@ kind: TestAssert
 metadata:
   name: test-regorule
 commands:
-  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test'
+  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test'
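
The assertions above only exercise the data endpoint through the renamed role Service. A similar, purely hypothetical check against the new per-rolegroup metrics Service could look like this, assuming curl is available in the test Pod and the naming from the tests above:

apiVersion: kuttl.dev/v1beta1
kind: TestAssert
metadata:
  name: test-metrics
commands:
  # Hypothetical: queries OPA's Prometheus endpoint through the new -metrics Service
  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- curl -s 'http://test-opa-server-default-metrics:8081/metrics'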
