From 01c0dece67626f706d0bc31b9b2f522ba6bdacd7 Mon Sep 17 00:00:00 2001
From: Hsin Chen
Date: Mon, 13 Oct 2025 17:11:36 -0700
Subject: [PATCH 1/4] Update alert triage agent to work with optimizer

Signed-off-by: Hsin Chen
---
 .../configs/config_offline_optimizer.yml     | 189 ++++++++++++++++++
 .../optimizer_prompt.py                      |  28 +++
 .../src/nat_alert_triage_agent/register.py   |  14 +-
 .../telemetry_metrics_analysis_agent.py      |  16 +-
 4 files changed, 241 insertions(+), 6 deletions(-)
 create mode 100644 examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/configs/config_offline_optimizer.yml
 create mode 100644 examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompt.py

diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/configs/config_offline_optimizer.yml b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/configs/config_offline_optimizer.yml
new file mode 100644
index 0000000000..5b9972331a
--- /dev/null
+++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/configs/config_offline_optimizer.yml
@@ -0,0 +1,189 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
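+
+# Offline optimizer configuration for the alert triage agent example: the
+# workflow and tools run in offline mode against the synthetic datasets, and
+# `nat optimize` tunes the fields marked `optimizable_params` below (LLM
+# sampling settings and agent prompts), scoring each candidate with the
+# evaluators defined in the `eval` section.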
+ + +general: + telemetry: + tracing: + weave: + _type: weave + project: "nat-ata-optimize" + +functions: + hardware_check: + _type: hardware_check + llm_name: tool_reasoning_llm + offline_mode: true + host_performance_check: + _type: host_performance_check + llm_name: tool_reasoning_llm + offline_mode: true + monitoring_process_check: + _type: monitoring_process_check + llm_name: tool_reasoning_llm + offline_mode: true + network_connectivity_check: + _type: network_connectivity_check + llm_name: tool_reasoning_llm + offline_mode: true + telemetry_metrics_host_heartbeat_check: + _type: telemetry_metrics_host_heartbeat_check + llm_name: tool_reasoning_llm + offline_mode: true + metrics_url: http://your-monitoring-server:9090 # Replace with your monitoring system URL if running in live mode + telemetry_metrics_host_performance_check: + _type: telemetry_metrics_host_performance_check + llm_name: tool_reasoning_llm + offline_mode: true + metrics_url: http://your-monitoring-server:9090 # Replace with your monitoring system URL if running in live mode + telemetry_metrics_analysis_agent: + _type: telemetry_metrics_analysis_agent + tool_names: + - telemetry_metrics_host_heartbeat_check + - telemetry_metrics_host_performance_check + llm_name: telemetry_metrics_analysis_agent_llm + optimizable_params: + - prompt + maintenance_check: + _type: maintenance_check + llm_name: maintenance_check_llm + static_data_path: examples/advanced_agents/alert_triage_agent/data/maintenance_static_dataset.csv + categorizer: + _type: categorizer + llm_name: categorizer_llm + prompt_init: + _type: prompt_init + optimizer_llm: optimizer_llm + system_objective: The alert triage agent autonomously investigates infrastructure monitoring alerts, performs root cause analysis, and generates structured diagnostic reports by dynamically selecting and orchestrating diagnostic tools including IPMI hardware checks, network connectivity tests, host performance monitoring, process status verification, and telemetry analysis, then correlating multi-source data through LLM-powered reasoning to classify issues into predefined categories (hardware, software, network, false positive, or requiring investigation), helping security analysts reduce manual triage workload, accelerate incident response times, and maintain consistent investigation quality through standardized evidence collection and automated documentation of findings and recommended remediation actions. + prompt_recombination: + _type: prompt_recombiner + optimizer_llm: optimizer_llm + system_objective: The alert triage agent autonomously investigates infrastructure monitoring alerts, performs root cause analysis, and generates structured diagnostic reports by dynamically selecting and orchestrating diagnostic tools including IPMI hardware checks, network connectivity tests, host performance monitoring, process status verification, and telemetry analysis, then correlating multi-source data through LLM-powered reasoning to classify issues into predefined categories (hardware, software, network, false positive, or requiring investigation), helping security analysts reduce manual triage workload, accelerate incident response times, and maintain consistent investigation quality through standardized evidence collection and automated documentation of findings and recommended remediation actions. 
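+
+# `prompt_init` and `prompt_recombination` above are the helper functions used
+# by the genetic prompt optimizer; they are referenced by name in the
+# `optimizer.prompt` section at the bottom of this file. Any field listed under
+# `optimizable_params` (for example `agent_prompt` on the workflow below, or
+# `temperature`/`top_p`/`max_tokens` on the LLMs) is exposed to the optimizer's
+# search.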
+ +workflow: + _type: alert_triage_agent + tool_names: + - hardware_check + - host_performance_check + - monitoring_process_check + - network_connectivity_check + - telemetry_metrics_analysis_agent + llm_name: ata_agent_llm + offline_mode: true + # The below paths are only used if offline_mode is true + offline_data_path: examples/advanced_agents/alert_triage_agent/data/offline_data.csv + benign_fallback_data_path: examples/advanced_agents/alert_triage_agent/data/benign_fallback_offline_data.json + optimizable_params: + - agent_prompt + +llms: + ata_agent_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0.2 + max_tokens: 2048 + optimizable_params: + - temperature + - top_p + - max_tokens + + tool_reasoning_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0.2 + top_p: 0.7 + max_tokens: 2048 + optimizable_params: + - temperature + - top_p + - max_tokens + + telemetry_metrics_analysis_agent_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0 + max_tokens: 2048 + optimizable_params: + - temperature + - top_p + - max_tokens + + maintenance_check_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0 + max_tokens: 2048 + + categorizer_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0 + max_tokens: 2048 + optimizable_params: + - temperature + - top_p + - max_tokens + + nim_rag_eval_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + max_tokens: 8 + + optimizer_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0.5 + max_tokens: 2048 + + +eval: + general: + output_dir: .tmp/examples/advanced_agents/alert_triage_agent/original/ + workflow_alias: alert_triage_agent_optimizer + dataset: + _type: json + file_path: examples/advanced_agents/alert_triage_agent/data/offline_data.json + + evaluators: + rag_accuracy: + _type: ragas + metric: AnswerAccuracy + llm_name: nim_rag_eval_llm + classification_accuracy: + _type: classification_accuracy + +optimizer: + output_path: .tmp/examples/advanced_agents/alert_triage_agent/optimizer/ + reps_per_param_set: 2 + eval_metrics: + rag_accuracy: + evaluator_name: rag_accuracy + direction: maximize + classification_accuracy: + evaluator_name: classification_accuracy + direction: maximize + + numeric: + enabled: true + n_trials: 3 + + prompt: + enabled: true + prompt_population_init_function: prompt_init + prompt_recombination_function: prompt_recombination + ga_generations: 3 + ga_population_size: 3 + ga_diversity_lambda: 0.3 + ga_parallel_evaluations: 1 \ No newline at end of file diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompt.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompt.py new file mode 100644 index 0000000000..7a68cfaeac --- /dev/null +++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompt.py @@ -0,0 +1,28 @@ +class OptimizerPrompts: + AGENT_PROMPT_PURPOSE = """This is the system prompt that instructs the Alert Triage Agent on how to behave and respond to system alerts. It is used as a SystemMessage that's prepended to every LLM conversation, providing the agent with its role and behavior guidelines. 
+ +The prompt should be well-structured and provide specific instructions to help the agent: +- Analyze incoming alerts and identify their type (e.g., InstanceDown, HighCPUUsage) +- Select and use the appropriate diagnostic tools for each alert type (hardware_check, host_performance_check, network_connectivity_check, telemetry_metrics_analysis_agent, monitoring_process_check) +- Avoid calling the same tool repeatedly during a single alert investigation +- Correlate collected data from multiple tools to determine root causes +- Distinguish between true issues, false positives, and benign anomalies +- Generate structured markdown triage reports with clear sections: Alert Summary, Collected Metrics, Analysis, Recommended Actions, and Alert Status + +The prompt should give the agent clear security context and explicit instructions on the expected final report format to ensure consistent, actionable output for system analysts.""" + TELEMETRY_AGENT_PROMPT_PURPOSE = """This is the system prompt for the Telemetry Metrics Analysis Agent, a specialized sub-agent within the alert triage system. It is used as a SystemMessage for a nested agent that the main Alert Triage Agent can call to analyze remotely collected telemetry data. + +This sub-agent receives two inputs (`host_id` and `alert_type`) and is responsible for selecting and using the appropriate telemetry analysis tools to investigate the alert. It has access to two specialized telemetry tools: +- `telemetry_metrics_host_heartbeat_check`: Checks server heartbeat to determine if the host is up and responsive +- `telemetry_metrics_host_performance_check`: Analyzes CPU usage trends over the past 14 days to identify patterns + +The prompt should provide clear instructions to help the agent: +- Understand the alert type and associated host_id provided as input +- Select the correct tool based on the alert type (heartbeat check for instance down alerts, performance check for high CPU usage alerts) +- Execute the selected tool exactly once to gather telemetry data +- Analyze the collected data to identify patterns such as periodic behavior, anomalous peaks, or normal fluctuations +- Return raw data from the tool along with a concise summary of findings +- Highlight any signs that indicate benign (non-critical) behavior, such as normal periodic spikes or consistent uptime, to help de-escalate false alarms +- Provide insights or hypotheses that explain whether the telemetry supports or contradicts the triggered alert + +The prompt should ensure the agent delivers actionable intelligence that helps the main Alert Triage Agent distinguish between genuine issues and false positives.""" \ No newline at end of file diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py index 5f87e6c8f0..0d184492a9 100644 --- a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py +++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py @@ -24,6 +24,8 @@ from nat.cli.register_workflow import register_function from nat.data_models.component_ref import LLMRef from nat.data_models.function import FunctionBaseConfig +from nat.data_models.optimizable import OptimizableField +from nat.data_models.optimizable import SearchSpace from nat.profiler.decorators.function_tracking import track_function # flake8: noqa @@ -41,6 +43,7 @@ # Import custom evaluator from .classification_evaluator import 
register_classification_evaluator from .prompts import ALERT_TRIAGE_AGENT_PROMPT +from .optimizer_prompt import OptimizerPrompts class AlertTriageAgentWorkflowConfig(FunctionBaseConfig, name="alert_triage_agent"): @@ -61,8 +64,15 @@ class AlertTriageAgentWorkflowConfig(FunctionBaseConfig, name="alert_triage_agen benign_fallback_data_path: str | None = Field( default="examples/advanced_agents/alert_triage_agent/data/benign_fallback_offline_data.json", description="Path to the JSON file with baseline/normal system behavior data") - agent_prompt: str = Field(default=ALERT_TRIAGE_AGENT_PROMPT, - description="The system prompt to use for the alert triage agent.") + agent_prompt: str = OptimizableField( + default=ALERT_TRIAGE_AGENT_PROMPT, + description="The system prompt to use for the alert triage agent.", + space=SearchSpace( + is_prompt=True, + prompt=ALERT_TRIAGE_AGENT_PROMPT, + prompt_purpose=OptimizerPrompts.AGENT_PROMPT_PURPOSE, + ) + ) @register_function(config_type=AlertTriageAgentWorkflowConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN]) diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py index 2bcc43df70..6b198e16ee 100644 --- a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py +++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py @@ -21,18 +21,26 @@ from nat.cli.register_workflow import register_function from nat.data_models.component_ref import LLMRef from nat.data_models.function import FunctionBaseConfig - +from nat.data_models.optimizable import OptimizableField +from nat.data_models.optimizable import SearchSpace from . 
import utils from .prompts import TelemetryMetricsAnalysisAgentPrompts - +from .optimizer_prompt import OptimizerPrompts class TelemetryMetricsAnalysisAgentConfig(FunctionBaseConfig, name="telemetry_metrics_analysis_agent"): description: str = Field(default=TelemetryMetricsAnalysisAgentPrompts.TOOL_DESCRIPTION, description="Description of the tool for the triage agent.") tool_names: list[str] = [] llm_name: LLMRef - prompt: str = Field(default=TelemetryMetricsAnalysisAgentPrompts.PROMPT, - description="Main prompt for the telemetry metrics analysis agent.") + prompt: str | None = OptimizableField( + default=TelemetryMetricsAnalysisAgentPrompts.PROMPT, + description="The system prompt to use for the alert triage agent.", + space=SearchSpace( + is_prompt=True, + prompt=TelemetryMetricsAnalysisAgentPrompts.PROMPT, + prompt_purpose=OptimizerPrompts.TELEMETRY_AGENT_PROMPT_PURPOSE, + ) + ) @register_function(config_type=TelemetryMetricsAnalysisAgentConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN]) From 10cfa06684f120c72cb84c10bd0325280b920c64 Mon Sep 17 00:00:00 2001 From: Hsin Chen Date: Mon, 13 Oct 2025 18:18:53 -0700 Subject: [PATCH 2/4] Update README to include the mentioning of the optimizer; address comments Signed-off-by: Hsin Chen --- .../alert_triage_agent/README.md | 130 ++++++++++++++++-- ...timizer_prompt.py => optimizer_prompts.py} | 22 +++ .../telemetry_metrics_analysis_agent.py | 2 +- 3 files changed, 144 insertions(+), 10 deletions(-) rename examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/{optimizer_prompt.py => optimizer_prompts.py} (74%) diff --git a/examples/advanced_agents/alert_triage_agent/README.md b/examples/advanced_agents/alert_triage_agent/README.md index 8be1b52f73..2fd11dd2fa 100644 --- a/examples/advanced_agents/alert_triage_agent/README.md +++ b/examples/advanced_agents/alert_triage_agent/README.md @@ -38,6 +38,9 @@ This example demonstrates how to build an intelligent alert triage system using - [Evaluation](#evaluation) - [General](#general) - [Evaluators](#evaluators) + - [Optimization](#optimization) + - [Numeric Optimization](#numeric-optimization) + - [Prompt Optimization](#prompt-optimization) - [Example Usage](#example-usage) - [Running in a live environment](#running-in-a-live-environment) - [Credentials and Access](#credentials-and-access) @@ -292,6 +295,102 @@ Each entry under `evaluators` defines a specific metric to evaluate the pipeline The list of evaluators can be extended or swapped out depending on your evaluation goals. +#### Optimization + +An optional `optimizer` section can be found in [`configs/config_offline_optimizer.yml`](src/nat_alert_triage_agent/configs/config_offline_optimizer.yml). It enables automated hyperparameter tuning and prompt optimization to improve the agent's performance. The optimizer uses the evaluation metrics defined in the `eval` section to search for better configurations. 
+ +```yaml +optimizer: + output_path: .tmp/examples/advanced_agents/alert_triage_agent/optimizer/ + reps_per_param_set: 2 + eval_metrics: + rag_accuracy: + evaluator_name: rag_accuracy + direction: maximize + classification_accuracy: + evaluator_name: classification_accuracy + direction: maximize + + numeric: + enabled: true + n_trials: 3 + + prompt: + enabled: true + prompt_population_init_function: prompt_init + prompt_recombination_function: prompt_recombination + ga_generations: 3 + ga_population_size: 3 + ga_diversity_lambda: 0.3 + ga_parallel_evaluations: 1 +``` + +* `output_path`: Directory where optimization results, including trial configurations, scores, and best parameters, are saved. +* `reps_per_param_set`: Number of times to evaluate each parameter configuration to account for variability in LLM outputs. Higher values provide more reliable metrics but increase evaluation time. +* `eval_metrics`: Dictionary of metrics to optimize. Each entry includes: + * `evaluator_name`: Name of the evaluator from the `eval.evaluators` section. + * `direction`: Either `maximize` or `minimize`, indicating whether higher or lower scores are better. + +##### Numeric Optimization + +The `numeric` section enables automated hyperparameter tuning for numeric parameters like temperature, `top_p`, and `max_tokens`. The optimizer uses Optuna's Bayesian optimization to efficiently search the parameter space. + +* `enabled`: Set to `true` to enable numeric parameter optimization. +* `n_trials`: Number of optimization trials to run. Each trial tests a different combination of hyperparameters. More trials allow for better exploration but require more evaluation time. + +To mark a numeric parameter as optimizable, add `optimizable_params` to the relevant configuration section. For example: + +```yaml +llms: + ata_agent_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0.2 + max_tokens: 2048 + optimizable_params: + - temperature + - top_p + - max_tokens +``` + +##### Prompt Optimization + +The `prompt` section of `optimizer` enables genetic algorithm-based prompt optimization to automatically improve prompt instructions. + +* `enabled`: Set to `true` to enable prompt optimization. +* `prompt_population_init_function`: Function name (from the `functions` section) that generates the initial population of prompt variations. +* `prompt_recombination_function`: Function name (from the `functions` section) that combines successful prompts to create new candidates. +* `ga_generations`: Number of generations for the genetic algorithm. +* `ga_population_size`: Size of the population for each generation. +* `ga_diversity_lambda`: Diversity penalty strength to discourage duplicate prompt sets. +* `ga_parallel_evaluations`: Maximum number of concurrent evaluations. + +For more detailed explanations of all genetic algorithm configuration options, see the [Optimizer Reference](../../../docs/source/reference/optimizer.md). + +To mark a prompt as optimizable, add `optimizable_params` to the relevant configuration section. 
For example: + +```yaml +functions: + telemetry_metrics_analysis_agent: + _type: telemetry_metrics_analysis_agent + tool_names: + - telemetry_metrics_host_heartbeat_check + - telemetry_metrics_host_performance_check + llm_name: telemetry_metrics_analysis_agent_llm + optimizable_params: + - prompt + +workflow: + _type: alert_triage_agent + tool_names: + - hardware_check + - host_performance_check + llm_name: ata_agent_llm + optimizable_params: + - agent_prompt +``` + +Both numeric and prompt optimization can be enabled simultaneously. The optimizer will coordinate both optimization strategies in stages, finding the best overall configuration. ## Example Usage You can run the agent in [offline mode](#running-in-offline-mode) or [live mode](#running-live-with-a-http-server-listening-for-alerts). Offline mode allows you to evaluate the agent in a controlled, offline environment using synthetic data. Live mode allows you to run the agent in a real environment. @@ -429,26 +528,26 @@ To use this mode, first ensure you have configured your live environment as desc Offline mode lets you evaluate the triage agent in a controlled, offline environment using synthetic data. Instead of calling real systems, the agent uses predefined inputs to simulate alerts and tool outputs, ideal for development, debugging, and tuning. To run in offline mode: -1. **Set required environment variables** +#### 1. **Set required environment variables** Make sure `offline_mode: true` is set in both the `workflow` section and individual tool sections of your config file (see [Understanding the configuration](#understanding-the-configuration) section). -2. **How offline mode works:** +#### 2. **How offline mode works:** - The **main CSV offline dataset** (`offline_data_path`) provides both alert details and a mock environment. For each alert, expected tool return values are included. These simulate how the environment would behave if the alert occurred on a real system. - The **JSON offline dataset** (`eval.general.dataset.filepath` in the config) contains a subset of the information from the main CSV: the alert inputs and their associated ground truth root causes. It is used to run `nat eval`, focusing only on the essential data needed for running the workflow, while the full CSV retains the complete mock environment context. - At runtime, the system links each alert in the JSON dataset to its corresponding context in the CSV using the unique host IDs included in both datasets. - The **benign fallback dataset** fills in tool responses when the agent calls a tool not explicitly defined in the alert's offline data. These fallback responses mimic healthy system behavior and help provide the "background scenery" without obscuring the true root cause. -3. **Run the agent in offline mode** - - To run the agent in offline mode with a test question, use the following command structure. Test questions can be found in `examples/advanced_agents/alert_triage_agent/data/offline_data.json`. +#### 3. **Run the agent in offline mode** +##### Single alert run +To run the agent in offline mode with a test alert, use the following command structure. Test alert examples can be found in `examples/advanced_agents/alert_triage_agent/data/offline_data.json`. 
   ```bash
   nat run --config_file=examples/advanced_agents/alert_triage_agent/configs/config_offline_mode.yml --input "{your_alert_in_json_format}"
   ```
 
-   **Example:** To run the agent with a test question, use the following command:
+   **Example:** To run the agent with a test alert, use the following command:
 
    ```bash
    nat run \
@@ -515,7 +614,8 @@ To run in offline mode:
    2025-07-21 17:14:45,234 - nat_alert_triage_agent - INFO - Cleaning up
    ```
 
-   To evaluate the agent, use the following command:
+##### Evaluation of a dataset
+To evaluate the agent using the test dataset, use the following command:
 
    ```bash
    nat eval --config_file=examples/advanced_agents/alert_triage_agent/configs/config_offline_mode.yml
@@ -528,8 +628,20 @@ To run in offline mode:
    - Run evaluation for the metrics specified in the config `eval.evaluators`
    - Save the pipeline output along with the evaluation results to the path specified by `eval.output_dir`
 
-4. **Understanding the output**
-   The output file will be located in the `eval.output_dir` directory and will include a `workflow_output.json` file as part of the evaluation run (alongside other results from each evaluator). This file contains a list of JSON objects, each representing the result for a single data point. Each entry includes the original alert (`question`), the ground truth root cause classification from the dataset (`answer`), the detailed diagnostic report generated by the agentic system (`generated_answer`), and a trace of the agent’s internal reasoning and tool usage (`intermediate_steps`).
+##### Optimization over a dataset
+To optimize the agent over the test dataset, use the following command:
+
+   ```bash
+   nat optimize --config_file=examples/advanced_agents/alert_triage_agent/configs/config_offline_optimizer.yml
+   ```
+
+   The agent will:
+   - Load alerts from the JSON dataset specified in the config `eval.general.dataset.filepath`
+   - Run optimization for the metrics specified in the config `optimizer.eval_metrics`
+   - Save the optimization results to the path specified by `optimizer.output_path`
+
+#### 4. **Understanding the output**
+If you run `nat eval` over a dataset, the output file will be located in the `eval.output_dir` directory and will include a `workflow_output.json` file as part of the evaluation run (alongside other results from each evaluator). This file contains a list of JSON objects, each representing the result for a single data point. Each entry includes the original alert (`question`), the ground truth root cause classification from the dataset (`answer`), the detailed diagnostic report generated by the agentic system (`generated_answer`), and a trace of the agent’s internal reasoning and tool usage (`intermediate_steps`).
 
 **Sample Workflow Result**
 ```
diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompt.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompts.py
similarity index 74%
rename from examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompt.py
rename to examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompts.py
index 7a68cfaeac..0624f6aba6 100644
--- a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompt.py
+++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompts.py
@@ -1,4 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + class OptimizerPrompts: + """ + Prompt purpose descriptions for optimizer-enabled agent configurations. + + This class defines the purpose and expected behavior of system prompts used + in the alert triage workflow, enabling prompt optimization through detailed + guidance on what each prompt should accomplish. + """ AGENT_PROMPT_PURPOSE = """This is the system prompt that instructs the Alert Triage Agent on how to behave and respond to system alerts. It is used as a SystemMessage that's prepended to every LLM conversation, providing the agent with its role and behavior guidelines. The prompt should be well-structured and provide specific instructions to help the agent: diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py index 6b198e16ee..88c3cda6d7 100644 --- a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py +++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py @@ -32,7 +32,7 @@ class TelemetryMetricsAnalysisAgentConfig(FunctionBaseConfig, name="telemetry_me description="Description of the tool for the triage agent.") tool_names: list[str] = [] llm_name: LLMRef - prompt: str | None = OptimizableField( + prompt: str = OptimizableField( default=TelemetryMetricsAnalysisAgentPrompts.PROMPT, description="The system prompt to use for the alert triage agent.", space=SearchSpace( From c6358d1cd8907eed7d15a865673cabf18d58c75b Mon Sep 17 00:00:00 2001 From: Hsin Chen Date: Tue, 21 Oct 2025 10:11:46 -0700 Subject: [PATCH 3/4] Address comments Signed-off-by: Hsin Chen --- .../optimizer_prompts.py | 8 ++++++- .../src/nat_alert_triage_agent/register.py | 16 ++++++------- .../telemetry_metrics_analysis_agent.py | 23 ++++++++++--------- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompts.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompts.py index 0624f6aba6..d140309b8d 100644 --- a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompts.py +++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/optimizer_prompts.py @@ -12,6 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Prompt purpose descriptions for the alert triage agent's prompt optimizer. + +This module defines prompt purposes used by the optimizer to understand and improve +system prompts for the alert triage workflow and its sub-agents. 
+""" + class OptimizerPrompts: """ @@ -47,4 +53,4 @@ class OptimizerPrompts: - Highlight any signs that indicate benign (non-critical) behavior, such as normal periodic spikes or consistent uptime, to help de-escalate false alarms - Provide insights or hypotheses that explain whether the telemetry supports or contradicts the triggered alert -The prompt should ensure the agent delivers actionable intelligence that helps the main Alert Triage Agent distinguish between genuine issues and false positives.""" \ No newline at end of file +The prompt should ensure the agent delivers actionable intelligence that helps the main Alert Triage Agent distinguish between genuine issues and false positives.""" diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py index 0d184492a9..9a555e5a51 100644 --- a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py +++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py @@ -64,15 +64,13 @@ class AlertTriageAgentWorkflowConfig(FunctionBaseConfig, name="alert_triage_agen benign_fallback_data_path: str | None = Field( default="examples/advanced_agents/alert_triage_agent/data/benign_fallback_offline_data.json", description="Path to the JSON file with baseline/normal system behavior data") - agent_prompt: str = OptimizableField( - default=ALERT_TRIAGE_AGENT_PROMPT, - description="The system prompt to use for the alert triage agent.", - space=SearchSpace( - is_prompt=True, - prompt=ALERT_TRIAGE_AGENT_PROMPT, - prompt_purpose=OptimizerPrompts.AGENT_PROMPT_PURPOSE, - ) - ) + agent_prompt: str = OptimizableField(default=ALERT_TRIAGE_AGENT_PROMPT, + description="The system prompt to use for the alert triage agent.", + space=SearchSpace( + is_prompt=True, + prompt=ALERT_TRIAGE_AGENT_PROMPT, + prompt_purpose=OptimizerPrompts.AGENT_PROMPT_PURPOSE, + )) @register_function(config_type=AlertTriageAgentWorkflowConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN]) diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py index 88c3cda6d7..ca0484587a 100644 --- a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py +++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/telemetry_metrics_analysis_agent.py @@ -23,24 +23,25 @@ from nat.data_models.function import FunctionBaseConfig from nat.data_models.optimizable import OptimizableField from nat.data_models.optimizable import SearchSpace + from . 
import utils
+from .optimizer_prompts import OptimizerPrompts
 from .prompts import TelemetryMetricsAnalysisAgentPrompts
-from .optimizer_prompt import OptimizerPrompts
+
 
 class TelemetryMetricsAnalysisAgentConfig(FunctionBaseConfig, name="telemetry_metrics_analysis_agent"):
     description: str = Field(default=TelemetryMetricsAnalysisAgentPrompts.TOOL_DESCRIPTION,
                              description="Description of the tool for the triage agent.")
-    tool_names: list[str] = []
+    tool_names: list[str] = Field(default_factory=list,
+                                  description="List of tool names to use for the telemetry metrics analysis agent.")
     llm_name: LLMRef
-    prompt: str = OptimizableField(
-        default=TelemetryMetricsAnalysisAgentPrompts.PROMPT,
-        description="The system prompt to use for the alert triage agent.",
-        space=SearchSpace(
-            is_prompt=True,
-            prompt=TelemetryMetricsAnalysisAgentPrompts.PROMPT,
-            prompt_purpose=OptimizerPrompts.TELEMETRY_AGENT_PROMPT_PURPOSE,
-        )
-    )
+    prompt: str = OptimizableField(default=TelemetryMetricsAnalysisAgentPrompts.PROMPT,
+                                   description="The system prompt to use for the telemetry metrics analysis agent.",
+                                   space=SearchSpace(
+                                       is_prompt=True,
+                                       prompt=TelemetryMetricsAnalysisAgentPrompts.PROMPT,
+                                       prompt_purpose=OptimizerPrompts.TELEMETRY_AGENT_PROMPT_PURPOSE,
+                                   ))
 
 
 @register_function(config_type=TelemetryMetricsAnalysisAgentConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])

From 87cedc8bc710fa03c46ffb6f8fb2e5780bbade5c Mon Sep 17 00:00:00 2001
From: Hsin Chen
Date: Tue, 21 Oct 2025 10:27:24 -0700
Subject: [PATCH 4/4] Address comments

Signed-off-by: Hsin Chen
---
 .../alert_triage_agent/src/nat_alert_triage_agent/register.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py
index 9a555e5a51..e85d104b65 100644
--- a/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py
+++ b/examples/advanced_agents/alert_triage_agent/src/nat_alert_triage_agent/register.py
@@ -43,7 +43,7 @@
 # Import custom evaluator
 from .classification_evaluator import register_classification_evaluator
 from .prompts import ALERT_TRIAGE_AGENT_PROMPT
-from .optimizer_prompt import OptimizerPrompts
+from .optimizer_prompts import OptimizerPrompts
 
 
 class AlertTriageAgentWorkflowConfig(FunctionBaseConfig, name="alert_triage_agent"):