diff --git a/docs/.nav.yml b/docs/.nav.yml old mode 100644 new mode 100755 index 1c97120..c059176 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -5,14 +5,37 @@ nav: - Quickstart: quickstart.md - Installation: installation.md - Design: - - design/*.md - - MLConnector: - - mlconnector/Overview.md - - mlconnector/Installation.md - - mlconnector/Step-by-step guide.md + - Architecture: design/architecture.md + - Agents: + - Multi-layer Agents: design/agents.md + - MAPE Tasks: design/mape.md + - SPADE: design/spade.md + - Controllers: design/controllers.md + - Plugins: + - Plugin System: design/plugins/plugin_system.md + - Policy Plugins: design/plugins/policy_plugins.md + - Mechanism Plugins: design/plugins/mechanism_plugins.md + - Descriptions: + - Application Description: design/application-description.md + - System Description: design/system-description.md + - Telemetry: design/telemetry.md + - Agent Configuration: design/agent-configuration.md + - ML Connector: design/ml-connector.md + - User Guide: + - Application Description: + - System Description: + - Policy Plugins: + - Mechanism Plugins: + - MLConnector: + - mlconnector/Overview.md + - mlconnector/Installation.md + - mlconnector/Step-by-step guide.md - Developer Guide: - developer-guide/*.md - Tutorials: - tutorials/*.md - - + - References: + - Python Telemetry API Reference: references/telemetrysdk.md + - Northbound API Reference: references/northbound-api.md + - ML Connector API Reference: references/ml-connector.md + - Command-line Interfaces: references/cli.md diff --git a/docs/CNAME b/docs/CNAME old mode 100644 new mode 100755 diff --git a/docs/assets/img/EN-Funded.png b/docs/assets/img/EN-Funded.png new file mode 100755 index 0000000..be857c1 Binary files /dev/null and b/docs/assets/img/EN-Funded.png differ diff --git a/docs/assets/img/agent_blocks.png b/docs/assets/img/agent_blocks.png new file mode 100755 index 0000000..6099b32 Binary files /dev/null and b/docs/assets/img/agent_blocks.png differ diff --git a/docs/assets/img/agent_high.png b/docs/assets/img/agent_high.png new file mode 100755 index 0000000..e009546 Binary files /dev/null and b/docs/assets/img/agent_high.png differ diff --git a/docs/assets/img/app_description_sequence.png b/docs/assets/img/app_description_sequence.png new file mode 100644 index 0000000..986d424 Binary files /dev/null and b/docs/assets/img/app_description_sequence.png differ diff --git a/docs/assets/img/arch.png b/docs/assets/img/arch.png new file mode 100755 index 0000000..7e314fc Binary files /dev/null and b/docs/assets/img/arch.png differ diff --git a/docs/assets/img/cluster_telemetry.png b/docs/assets/img/cluster_telemetry.png new file mode 100755 index 0000000..14bbef5 Binary files /dev/null and b/docs/assets/img/cluster_telemetry.png differ diff --git a/docs/assets/img/concept.svg b/docs/assets/img/concept.svg new file mode 100755 index 0000000..78171f4 --- /dev/null +++ b/docs/assets/img/concept.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/assets/img/cont_telemetry.png b/docs/assets/img/cont_telemetry.png new file mode 100755 index 0000000..0ac3005 Binary files /dev/null and b/docs/assets/img/cont_telemetry.png differ diff --git a/docs/assets/img/hb_messages.png b/docs/assets/img/hb_messages.png new file mode 100755 index 0000000..50c0fcf Binary files /dev/null and b/docs/assets/img/hb_messages.png differ diff --git a/docs/assets/img/high_level_arch.png b/docs/assets/img/high_level_arch.png new file mode 100755 index 0000000..6328793 Binary files /dev/null and 
b/docs/assets/img/high_level_arch.png differ diff --git a/docs/assets/img/mlsysops-logo.png b/docs/assets/img/mlsysops-logo.png old mode 100644 new mode 100755 diff --git a/docs/assets/img/mlsysops_logo700x280.png b/docs/assets/img/mlsysops_logo700x280.png new file mode 100644 index 0000000..73e01c6 Binary files /dev/null and b/docs/assets/img/mlsysops_logo700x280.png differ diff --git a/docs/assets/img/node_telemetry.png b/docs/assets/img/node_telemetry.png new file mode 100755 index 0000000..c4899a4 Binary files /dev/null and b/docs/assets/img/node_telemetry.png differ diff --git a/docs/assets/img/otel_deploy_sequence.png b/docs/assets/img/otel_deploy_sequence.png new file mode 100644 index 0000000..3a58900 Binary files /dev/null and b/docs/assets/img/otel_deploy_sequence.png differ diff --git a/docs/assets/img/plugin_exec_flow.png b/docs/assets/img/plugin_exec_flow.png new file mode 100755 index 0000000..b1d3b8d Binary files /dev/null and b/docs/assets/img/plugin_exec_flow.png differ diff --git a/docs/assets/img/system_description_sequence.png b/docs/assets/img/system_description_sequence.png new file mode 100644 index 0000000..3093ef5 Binary files /dev/null and b/docs/assets/img/system_description_sequence.png differ diff --git a/docs/assets/img/telemetry_high.jpg b/docs/assets/img/telemetry_high.jpg new file mode 100755 index 0000000..dd3a264 Binary files /dev/null and b/docs/assets/img/telemetry_high.jpg differ diff --git a/docs/assets/img/telemetry_pipeline.png b/docs/assets/img/telemetry_pipeline.png new file mode 100755 index 0000000..08bd49e Binary files /dev/null and b/docs/assets/img/telemetry_pipeline.png differ diff --git a/docs/assets/javascripts/console-copy.js b/docs/assets/javascripts/console-copy.js old mode 100644 new mode 100755 diff --git a/docs/assets/stylesheets/theme.css b/docs/assets/stylesheets/theme.css old mode 100644 new mode 100755 diff --git a/docs/design/agent-configuration.md b/docs/design/agent-configuration.md new file mode 100644 index 0000000..1690c43 --- /dev/null +++ b/docs/design/agent-configuration.md @@ -0,0 +1,32 @@ +Each agent uses a configuration file that defines its behaviour during instantiation. While agents operating at +different layers of the continuum instantiate different components of the core MLSysOps framework, all agents running on +nodes use the same base instance. However, since node characteristics may vary significantly, each agent can be +individually configured using its corresponding configuration file. + +```YAML +telemetry: + default_metrics: + - "node_load1" + monitor_data_retention_time: 30 + monitor_interval: 10s + managed_telemetry: + enabled: True + +policy_plugins: + directory: "policies" + +mechanism_plugins: + directory: "mechanisms" + enabled_plugins: + - "CPUFrequencyConfigurator" + +continuum_layer: "node" + +system_description: 'descriptions/rpi5-1.yaml' + +behaviours: + APIPingBehaviour: + enabled: False + Subscribe: + enabled: False +``` \ No newline at end of file diff --git a/docs/design/agents.md b/docs/design/agents.md new file mode 100755 index 0000000..775d85b --- /dev/null +++ b/docs/design/agents.md @@ -0,0 +1,36 @@ +The agent component forms the core of the MLSysOps framework. It provides essential integration logic across all layers, +connecting the configuration mechanisms of the underlying system, telemetry data collected from various system +entities (e.g., application, infrastructure), and system configuration policies. Figure 32 illustrates the high-level +architectural structure of the agent. 
The component exposes two interfaces—the Northbound and Southbound APIs—which offer structured methods for different
+system users to interact with it. The Northbound API targets application and policy developers, whereas the Southbound
+API is primarily intended for system administrators and mechanism providers.
+
+
+
+
+The agent follows the MAPE (Monitor-Analyze-Plan-Execute) paradigm, proposed in 2003 [55] for managing autonomic
+systems given high-level objectives from system administrators, and applies the same notion to its main configuration
+tasks, depicted as MAPE Tasks in Figure 32. The agent is implemented in Python and leverages the SPADE Python
+multi-agent framework [56] to form a network of agents that communicate through the XMPP protocol and a set of defined
+messages, with the required functionality provided by internal tasks called behaviours. To achieve seamless operation
+between the various sub-modules, the agent implements a set of controllers that are responsible for managing the
+various external and internal interactions.
+
+One important design goal of the agent was extensibility. This goal is achieved by defining simple yet powerful
+abstractions for two important actors interacting with the system: on one side, the policy developer, who implements the
+core management logic, and on the other side, the mechanism provider, who exposes the available configuration options
+for a subsystem. Both abstractions are integrated into the MLSysOps agent as plugin functionalities, specifically named
+policy and mechanism plugins. The agent's analysis, planning, and execution tasks depend on this plugin system to
+generate intelligent configuration decisions—provided by the installed policy plugins—and to apply those decisions to
+the underlying system via the available mechanism plugins.
+
+
+
+The agent software is structured into different module types:
+
+- Core Module – Provides foundational functionalities shared by all agent instances (continuum, cluster, and node).
+- Layer-Specific Modules – Offer customized implementations specific to the roles of continuum, cluster, or node agents.
+- External Interface Modules – Facilitate interactions between the agent framework and external entities. These modules
+include the CLI, the Northbound API, the ML Connector, and the policy and mechanism plugins.
+
+This modular architecture ensures consistency in core functionalities across all agents, while also supporting
+customization and extension for specific layers and external interactions.
diff --git a/docs/design/application-description.md b/docs/design/application-description.md
new file mode 100644
index 0000000..e360ae3
--- /dev/null
+++ b/docs/design/application-description.md
@@ -0,0 +1,14 @@
+The application owner, one of the main actors, interacts with MLSysOps by submitting the application description using
+the Command Line Interface (CLI) provided by the framework. The application description captures the required deployment
+constraints (e.g., node type, hardware, sensor requirements), which enable various filtering options at the continuum
+and cluster layers, which can then decide on the candidate clusters and nodes, respectively. Taking the registration of
+a given application as an example, as shown in the figure below, the necessary information is propagated Top-Down to
+each layer of the continuum. Initially, the Continuum agent creates a Kubernetes Custom Resource that is propagated to
+the available Kubernetes clusters.
The Cluster agents follow the Kubernetes Operator pattern, so they are +notified of application creation, update, or removal events. Each Cluster agent manages the components that match its +cluster ID, if any. This information is provided by the Continuum agent in the application's Custom Resource. A given +Cluster agent captures the application creation event, parses the description, and deploys the components based on the +provided requirements. The component specifications are also sent to their host nodes, so that the Node agents can store +relevant fields required for any potential reconfiguration/adaptation. + +![app_description_sequence.png](../assets/img/app_description_sequence.png) \ No newline at end of file diff --git a/docs/design/architecture.md b/docs/design/architecture.md new file mode 100755 index 0000000..c94a298 --- /dev/null +++ b/docs/design/architecture.md @@ -0,0 +1,79 @@ +# Architecture + +MLSysOps introduces a hierarchical agent-based architecture composed of three levels: +- Node Agents reside on individual nodes and expose configuration interfaces, monitor resource usage, and provide direct +access to telemetry. +- Cluster Agents coordinate groups of nodes, aggregate telemetry, and issue deployment decisions or adaptation +instructions. +- The Continuum Agent sits at the top level, interfacing with external stakeholders (via northbound APIs), receiving +high-level intents and application descriptors, and coordinating decision-making across slices. +Each layer operates a Monitor–Analyze–Plan–Execute (MAPE) control loop, enabling autonomous adaptation based on local +and global telemetry, system optimization targets, and ML-driven policies. Importantly, this architecture separates +management logic from resource control, allowing for modular evolution and system introspection. + +The MLSysOps agents, supported by ML models, analyse, predict, and optimize resource usage patterns and overall system +performance by allocating, monitoring and configuring the different resources of the underlying layers via the +mechanisms that are implemented in the context of WP3 and manifested in the current deliverable. This integration is a +collaborative effort that draws on the diverse expertise of project partners, each contributing unique insights and +solutions to the multifaceted challenges of cloud and edge computing. This collaborative approach is complemented by an +iterative development process characterized by continuous testing and feedback loops. Such a process ensures that the +mechanisms developed are not only effective in their current context but are also scalable and adaptable to future +technological advancements and operational needs. + + + +The following figure depicts a comprehensive illustration of the MLSysOps hierarchical agent system's placement +and its interactions with two other fundamental subsystems: container orchestration and telemetry. This agent hierarchy +is structured in line with the orchestration architecture, and it is logically divided into three tiers. The +communication among the three subsystems (agents, container orchestration, and telemetry) is facilitated through +designated interfaces at each tier. Moreover, the agent system engages with the continuum level's system agents and +integrates plug-in configuration policies that can use ML models at all levels. At every level, agents utilize mechanism +plugins to implement commands for adjusting available configuration and execution mode options. 
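+To make the control loop concrete, the following minimal sketch shows how one iteration of a layer's MAPE loop could
+wire a policy to a mechanism plugin. It is purely illustrative: the names used here (collect_telemetry, ExamplePolicy,
+ExampleMechanism) are hypothetical placeholders and not part of the framework's actual API.
+
```python
# Illustrative sketch of a single MAPE iteration at one layer of the hierarchy.
# All names below are hypothetical placeholders, not the framework's actual API.


def collect_telemetry() -> dict:
    """Monitor: gather metrics for this layer (stubbed with a fixed value)."""
    return {"node_load1": 0.7}


class ExamplePolicy:
    """Stand-in for a policy plugin exposing analyze and plan."""

    def analyze(self, telemetry: dict) -> bool:
        # A new plan is needed when the load exceeds a threshold.
        return telemetry["node_load1"] > 0.5

    def plan(self, telemetry: dict) -> dict:
        # Plans are keyed by the mechanism that must apply them.
        return {"CPUFrequency": {"command": "set", "cpu": "all", "frequency": "max"}}


class ExampleMechanism:
    """Stand-in for a mechanism plugin exposing apply."""

    def apply(self, command: dict) -> None:
        print(f"applying {command}")


def mape_iteration(policy: ExamplePolicy, mechanisms: dict) -> None:
    telemetry = collect_telemetry()          # Monitor
    if policy.analyze(telemetry):            # Analyze
        plan = policy.plan(telemetry)        # Plan
        for name, command in plan.items():   # Execute
            mechanisms[name].apply(command)


if __name__ == "__main__":
    mape_iteration(ExamplePolicy(), {"CPUFrequency": ExampleMechanism()})
```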
+ + + +Node-level agents’ interface with local telemetry systems and expose configuration knobs. Cluster-level agents +coordinate resource allocation decisions across groups of nodes. At the top level, the continuum agent handles global +orchestration, provides APIs to external actors, and aggregates telemetry data. ML-driven decisions can be made at every +layer, using information for the respective layer. This layered approach facilitates scalability and separation of +concerns while supporting collaboration across orchestration, telemetry, and ML systems. The agent infrastructure +interacts through three distinct types of interfaces. The Northbound API provides access to application developers and +system administrators. The Southbound API interfaces with the underlying telemetry collection and configuration +mechanisms. The ML Connector allows ML models to be plugged into the framework and invoked for training, prediction, and +explanation tasks. +The telemetry subsystem is built upon the OpenTelemetry specification and is responsible for collecting and processing +metrics, logs, and traces. These are abstracted into hierarchical telemetry streams that feed the decision logic of the +agents and the ML models. Data collection happens at the node level, where individual collectors expose metrics either +in raw or aggregated formats. These are processed through transformation pipelines and propagated to cluster and +continuum levels for higher-level aggregation and analysis. + +Application deployment and orchestration are driven by declarative descriptions submitted by application developers and +administrators. These descriptions capture the application's structure, resource requirements, and quality-of-service +objectives. Deployment is handled through standard container orchestration tools, which are extended by the MLSysOps +framework to support advanced placement decisions and runtime adaptation. For far-edge deployments, the framework +introduces a proxy-based architecture involving embServe on constrained devices and a virtual orchestrator service +running inside containerized environments. This approach allows resource-constrained devices to be seamlessly integrated +into the same orchestration and telemetry flows as more capable edge and cloud nodes. + +The object storage infrastructure builds upon and extends SkyFlok, a secure and distributed storage system. In MLSysOps, +this infrastructure supports adaptive reconfiguration of bucket policies based on real-time telemetry and application +usage patterns. The storage system exposes telemetry data regarding latency, bandwidth, and access frequency, enabling +agents and ML models to optimize redundancy and placement decisions without disrupting ongoing operations. +The framework also includes specialized subsystems for anomaly detection and trust assessment. These modules analyze +telemetry data to identify attacks or malfunctions and classify anomalies using ML models. Their outputs are exposed +through the telemetry interface and used by higher-level agents to trigger remediation strategies or adapt orchestration +plans. Trust levels for nodes are computed using a combination of identity, behaviour, and capability metrics, forming a +reputation-based model that influences agent decision-making. + +ML models play a central role in enabling the autonomic operation of the framework. Each level of the agent hierarchy +may employ one or more models, which are integrated via the ML Connector API. 
These models receive structured telemetry input and produce configuration decisions, which are interpreted and enacted
+by the agents. The framework supports reinforcement learning, continual learning, and federated learning scenarios. In
+addition, explainability mechanisms are integrated into the ML workflows to allow system administrators and application
+developers to understand and audit the decisions made by the models.
+
+MLSysOps effectively manages operations by leveraging telemetry data collected from each level, which provides essential
+insights. This data, combined with machine learning models, enhances the decision-making process, aligning with both the
+application's objectives and the system's requirements. Actions based on these decisions are cascaded and refined from
+the top level downwards. The final status and outcomes of these decisions are then made accessible to system Actors. The
+design and functionality of the telemetry system are further explained in [Telemetry system design](telemetry.md).
diff --git a/docs/design/controllers.md b/docs/design/controllers.md
new file mode 100644
index 0000000..f306647
--- /dev/null
+++ b/docs/design/controllers.md
@@ -0,0 +1,21 @@
+Controllers are responsible for coordinating all internal components of the framework, including the MAPE tasks, SPADE,
+the Policy and Mechanism Plugins, and the Northbound and Southbound APIs.
+
+- **Application Controller**: Manages the lifecycle of the Analyze loop for each application submitted to the system.
+When a new application is submitted, a corresponding Analyze behaviour is initiated, and it is terminated when the
+application is removed.
+
+- **Policy & Mechanism Plugin Controllers**: Responsible for loading, initializing, and configuring policy and mechanism
+plugins. During runtime, these controllers provide updated information to the Application Controller, reflecting any
+changes in the policy API files.
+
+- **Agent Configuration Controller**: Handles external configuration commands received from other agents or via the
+Northbound API, and propagates them to the appropriate internal components. It is also responsible for loading the
+initial configuration file during startup.
+
+- **Telemetry Controller**: Manages the OpenTelemetry Collector for each agent, including initial deployment and runtime
+configuration. Since each collector operates as a pod within the cluster, the Node Agent coordinates with the Cluster
+Agent to request deployment and updates, as depicted in the figure below. Additionally, this controller configures the
+Monitor task based on the telemetry metrics being collected.
+
+![otel_deploy_sequence.png](../assets/img/otel_deploy_sequence.png)
\ No newline at end of file
diff --git a/docs/design/index.md b/docs/design/index.md
old mode 100644
new mode 100755
diff --git a/docs/design/mape.md b/docs/design/mape.md
new file mode 100755
index 0000000..af376dd
--- /dev/null
+++ b/docs/design/mape.md
@@ -0,0 +1,56 @@
+The primary responsibility of the agents is to manage the system's assets—entities and components that can be configured
+and/or must be monitored. Typical assets include application components, available configuration mechanisms, and the
+telemetry system. The MAPE tasks continuously monitor the state of these assets, analyze their condition, determine
+whether a new configuration plan is required, and, if so, create and execute the plan using the mechanism plugins.
The +Analyze and Plan tasks invoke the logic implemented in the policy plugins, whereas the Execution task uses the mechanism +plugins. + +## Monitor +The Monitor task runs periodically, collecting information from the environment and updating the agent's internal state. +This information is sourced from the telemetry system, external mechanisms (via Southbound API mechanism plugins), and +other external entities (e.g., other agents). Although there is only a single instance of the Monitor task, it is +adaptive; its configuration can change at runtime based on the agent’s current requirements. A fundamental configuration +parameter is the frequency and type of information retrieved from the telemetry system. For example, when a new +application is submitted to the system, additional telemetry metrics may need to be collected and incorporated into the +internal state. + +## Analyze +For each distinct managed asset, a separate Analyze task thread runs periodically. This thread invokes the corresponding +method of the active policy plugin (see Section 8.3.1) for the specific asset, supplying all necessary inputs, including +telemetry data and relevant system information (e.g., application and system descriptions). Policy plugins may implement +the analysis logic using simple heuristics or employ machine learning models, either through the MLSysOps ML Connector +or via any external service. This task also includes core logic to perform basic failure checks in the event of errors +arising within the policy plugins. +The output of the Analyze task is a binary value (True or False), indicating whether a new configuration plan is +required for the analyzed asset. If the result is True, a new Plan task is initiated. + +## Plan +The Plan task is responsible for generating a new configuration plan and is executed once for each positive result +produced by the Analyze task. The planning logic, implemented by the policy plugins, is invoked upon trigger and +receives all necessary input data. +The output of this task is a dictionary containing values expected by each mechanism plugin. This dictionary represents +the configuration plan to be applied by the respective configuration mechanisms. The result is pushed into a queue and +forwarded to the Plan Scheduler (see Section 8.1.5). + +## Execute +This task is invoked by the Plan Scheduler (see Section 8.1.5) once for each mechanism that must be configured in a +given plan. Based on the dictionary provided by the plan, the corresponding mechanism plugin is called and supplied with +the relevant configuration data. The new configuration is applied using a best-effort approach, without any retry logic, +and the outcome is logged into the state (see Section 8.1.6). In the event of an error, it is expected that the +subsequent run of the Analyze task will detect the issue and handle it appropriately. + +## Plan Scheduler +Each agent supports the concurrent activation of multiple policy and mechanism plugins. As a result, different policies +may generate configuration plans for the same mechanism simultaneously. This situation can lead to conflicts during plan +execution, where multiple plans attempt to apply different—and potentially conflicting—configuration changes to the same +mechanism at the same time. To handle such conflicts, the MLSysOps agent includes a Plan Scheduler module that processes +the queued plans produced by Plan tasks (see Section 8.1.3) in a FIFO manner. 
The first plan in the queue is applied, and any subsequent plan targeting a mechanism already configured by a previous
+plan is discarded. The Plan Scheduler is designed to be extensible, allowing support for more advanced scheduling
+policies in the future.
+For each scheduled plan, a single Execute task (see Section 8.1.4) is launched to apply the new configuration.
+
+## State
+This is the internal state (memory) of the agents. Each agent contains different information depending on its
+environment (continuum layer, node type, etc.). Indicative information that needs to be kept includes the application
+descriptions as well as system and application telemetry. The state can also store historical snapshots of the
+telemetry data acquired by the Monitor task.
\ No newline at end of file
diff --git a/docs/design/plugins/mechanism_plugins.md b/docs/design/plugins/mechanism_plugins.md
new file mode 100755
index 0000000..621dd63
--- /dev/null
+++ b/docs/design/plugins/mechanism_plugins.md
@@ -0,0 +1,82 @@
+The MLSysOps framework does not impose assumptions about the underlying system architecture, recognizing that real-world
+infrastructures often consist of heterogeneous systems with varying adaptation capabilities and operational
+requirements. Different types of nodes offer different configuration options, and nodes operating at higher levels of
+the continuum (e.g., cluster or continuum nodes) have distinct configuration needs. To ensure seamless
+integration—especially with the policy plugins—MLSysOps defines a standardized plugin interface for system
+administrators and mechanism providers, known as **mechanism plugins**.
+
+To develop a mechanism plugin, a Python script must be provided, implementing three methods: (i) `apply`, (ii)
+`get_status`, and (iii) `get_options`. The plugin module may use any required libraries, and it is assumed that any
+necessary packages are pre-installed along with the agent.
+
+The methods are defined as follows (the examples refer to CPU frequency control on a node):
+
+**apply**: This is the primary method invoked by an Execute task. It accepts a single argument, `command`, which is a
+dictionary whose structure is defined and documented by the mechanism plugin. This dictionary is produced by the `plan`
+method of a policy plugin. The policy developer must be familiar with the available mechanism plugins in the system and
+the expected format of the `command` argument. Figure X shows an example of a CPU frequency configuration plugin that
+utilizes supporting libraries, as described in Section 3.2. The expected dictionary structure is documented in the
+method's comment section, followed by the call to the underlying library to apply the specified configuration.
+
```python
def apply(command: dict[str, any]):
    """
    Applies the given CPU frequency settings based on the provided parameters.

    This method modifies the CPU's frequency settings by either applying the changes across
    all CPUs or targeting a specific CPU. The modifications set a new minimum and maximum
    frequency based on the input values.

    Args:
        command (dict):
            {
                "command": "reset" | "set",
                "cpu": "all" | "0,1,2...",
                "frequency" : "min" | "max" | "1000000 Hz"
            }
    """
    # The rest of the code omitted
    cpufreq.set_frequencies(command['frequency'])
    # .....
```
+
+**Figure X. CPU Frequency configuration mechanism plugin (apply method).**
+
+**get\_status:** This method must return any available relevant status of the underlying mechanism.
In the CPU frequency configuration mechanism plugin shown in Figure X, the method returns the current frequencies of
+all the CPU cores. This is used by the Monitor and Execute tasks to observe the general status of the mechanism. The
+mechanism provider may choose not to implement this method and instead push the state into the telemetry stream.
+
```python
def get_status():
    """
    Retrieves the current CPU frequencies.

    Returns:
        list: A list of integers representing the current frequencies of the CPU cores. The
        frequencies are usually measured in MHz.
    """
    return get_cpu_current_frequencies()
```
+
+**Figure X. CPU Frequency mechanism status method.**
+
+**get\_options:** This method returns the available configuration options for the mechanism handled by the plugin. In
+the example in Figure X, it returns the available CPU frequency steps, which can be used as values for the *frequency*
+key of the command dictionary of the apply method. It is mainly intended for use in a development environment, where
+the MLSysOps framework provides suitable logging methods.
+
```python
def get_options():
    """
    Retrieves the list of CPU available frequencies.

    Returns:
        list: A list of frequencies supported by the CPU.
    """
    return get_cpu_available_frequencies()
```
+
+**Figure X. CPU Frequency mechanism options method.**
+
+The relationship and interaction between the policy and mechanism plugins are demonstrated in Section 2.4.4.
\ No newline at end of file
diff --git a/docs/design/plugins/plugin_system.md b/docs/design/plugins/plugin_system.md
new file mode 100755
index 0000000..d1da207
--- /dev/null
+++ b/docs/design/plugins/plugin_system.md
@@ -0,0 +1,58 @@
+# MLSysOps Plugin System
+
+The MLSysOps framework provides a structured plugin mechanism that enables a modular approach to integrating
+configuration decision logic with arbitrary mechanisms. Plugins are categorized into two types: core and custom. Core
+plugins are always enabled and are used by MLSysOps to perform essential management functions. In contrast, custom
+plugins can be configured and activated either during installation or at runtime, allowing flexibility and extensibility
+based on system requirements.
+
+Core plugins provide essential policies and mechanisms. The tables in the Plugin Inventory section below briefly
+describe the initial plugins that have been developed up to the time of writing of this document.
+
+## Execution Flow
+
+Figure X illustrates the execution flow of the MAPE tasks and the integration of both policy and mechanism plugins. The
+Monitor task runs periodically at all times, regardless of whether an application has been submitted, collecting
+telemetry data and updating the local state. When a new application is submitted to the system, a separate Analyze task
+thread is launched, which uses the analyze method of the corresponding policy plugin. Based on the result, the analysis
+session either terminates or triggers a background Plan task.
+
+The Plan task then invokes the policy plugin's plan method to generate a new configuration plan. If a valid plan is
+produced, it is pushed into a FIFO queue. The Plan Scheduler periodically processes the plans in the queue and initiates
+an Execute task for each mechanism included in the output of the policy plugin's plan method. The Plan Scheduler
+enforces a constraint that prevents two different plans from applying configuration changes to the same mechanism within
+the same scheduling cycle.
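+The following sketch illustrates this conflict-resolution behaviour under the assumptions stated above (FIFO
+processing, and at most one plan configuring a given mechanism per cycle). The class and method names are illustrative
+only and do not correspond to the framework's actual implementation.
+
```python
from collections import deque


class PlanScheduler:
    """Illustrative FIFO scheduler: a plan that targets a mechanism already
    configured earlier in the same cycle is discarded."""

    def __init__(self):
        self.queue = deque()

    def submit(self, plan: dict) -> None:
        """Called by a Plan task; a plan maps mechanism names to command dictionaries."""
        self.queue.append(plan)

    def run_cycle(self, execute) -> None:
        configured = set()  # mechanisms already configured in this cycle
        while self.queue:
            plan = self.queue.popleft()
            if any(mechanism in configured for mechanism in plan):
                continue  # conflicts with an earlier plan in this cycle: discard
            for mechanism, command in plan.items():
                configured.add(mechanism)
                execute(mechanism, command)  # would launch an Execute task


# Two policies produce plans for the same mechanism; the second plan is dropped.
scheduler = PlanScheduler()
scheduler.submit({"CPUFrequency": {"command": "set", "cpu": "all", "frequency": "max"}})
scheduler.submit({"CPUFrequency": {"command": "set", "cpu": "all", "frequency": "min"}})
scheduler.run_cycle(lambda mechanism, command: print(f"execute {mechanism}: {command}"))
```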
+
+Finally, the Execute task for each mechanism calls the apply method of the corresponding mechanism plugin. The results
+of the configuration are made visible to the next execution of the Analyze task, either via direct status retrieval or
+through the telemetry stream.
+
+
+
+
+## Plugin Inventory
+
+| Policy Name | Functionality | Type | Continuum Layer |
+|------------------------------------------|--------------------------------------------------------------------------------------------|--------|-----------------|
+| Static Placed Components | Places specific components on a node. | Core | Cluster |
+| Dynamic Placed Components | Places application components on a node, based on application description criteria. | Core | Cluster |
+| Cluster Component Placement | Places application components in a cluster, based on application description criteria. | Core | Cluster |
+| Smart Agriculture Drone Management | Decides on the usage of a drone in a field. | Custom | Cluster |
+| Smart Agriculture Application Prediction | Predicts the performance of the smart agriculture application. Does not produce any plan. | Custom | Node |
+| Smart City Noise Prediction | Predicts the presence of people, using sound sensors. Does not produce any plan. | Custom | Node |
+| Smart City Cluster Management | Configures the application deployment based on the node-level prediction metrics. | Custom | Cluster |
+| CC Storage Gateway Placement | Decides on the placement of the CC storage gateway container. | Custom | Cluster |
+
+| Mechanism Plugin Name | Functionality | Type | Continuum Layer |
+|------------------------------|-----------------------------------------------------------------------------|--------|-----------------|
+| Fluidity | Manages application component placement in a Kubernetes cluster. | Core | Cluster |
+| ContinuumComponent Placement | Places the components on specific clusters. | Core | Continuum |
+| CPUFrequency | Configures the CPU frequency for the supported architectures. | Core | Node |
+| NVIDIAGPUFrequency | Configures the GPU frequency for the supported architectures. | Core | Node |
+| FPGAConfigurator | Configures the active bitstream of a Xilinx MPSoC FPGA. | Core | Node |
+| vAccelPlugin | Configures the vAccel plugin used by a component. | Core | Node |
+| CCStorage | Configures the storage policy of a storage gateway. | Custom | Cluster |
+| NetworkRedirection | Configures the network interfaces that are used by application components. | Custom | Cluster |
+| ChangePodSpec | Configures the pod specifications for specific values. | Core | Node, Cluster |
+
+
diff --git a/docs/design/plugins/policy_plugins.md b/docs/design/plugins/policy_plugins.md
new file mode 100755
index 0000000..af56112
--- /dev/null
+++ b/docs/design/plugins/policy_plugins.md
@@ -0,0 +1,120 @@
+# Policy Plugins
+
+Policy plugins are the components responsible for determining whether a new adaptation is required and for generating
+new configuration plans. They follow the MAPE paradigm, specifically implementing the Analyze and Plan tasks. A policy
+plugin is implemented as a Python module, which may import and use any external libraries, and must define three
+specific methods: (i) initialize, (ii) analyze, and (iii) plan. Each method requires specific arguments and must return
+defined outputs. Each method accepts a common argument, context, which can be used to maintain state between different
+method calls, as well as across consecutive invocations of the policy plugin.
Policy plugins can be bound to a node or to multiple applications; in either case, a policy must decide on at least one
+mechanism.
+
+The methods are described as follows:
+
+**initialize**: This method contains the initialization configuration required by the agent. It is called during the
+plugin loading phase, and it must return the common context dictionary with specific values. An example is shown below,
+where the policy declares the telemetry configuration it requires, the mechanisms it will analyze and manage, any custom
+Python packages needed by the script, and any additional agent configuration parameters. An important parameter declares
+whether this policy will make use of machine learning; this enables the usage of the ML Connector interface and
+accordingly configures the mechanism that enables/disables machine learning usage in the framework.
+
```python
def initialize(context):
    context = {
        # The required values
        "telemetry": {
            "metrics": ["node_load1"],
            "system_scrape_interval": "1s"
        },
        "mechanisms": [
            "CPUFrequency"
        ],
        "packages": [
            # Any required Python packages needed by the policy
        ],
        "configuration": {
            # Agent configuration
            "analyze_interval": "4s"
        },
        "machine_learning_usage": False,
        # ... any other fields that the policy needs
    }

    return context
```
+
+**analyze**: The purpose of this method is to analyze the current state of the system and the target application, and
+determine whether a new configuration plan might be required. In the example shown in Figure X, the **analyze** function
+compares the current telemetry data for the application—retrieved using the application description—with the target
+value specified by the application. If the current value exceeds the defined threshold (target), the method concludes
+that a new plan is needed. In this example, it is assumed that the monitored application metric should remain below the
+specified target. The analyze method can also use the ML Connector interface to invoke machine learning models deployed
+through that service.
+
```python
def analyze(context, application_description, system_description, current_plan, telemetry, ml_connector):
    # Policy that checks whether the application target is achieved
    if telemetry['data']['application_metric'] > application_description['targets']['application_metric']:
        return True, context  # The target was not achieved - a new plan is needed

    return False, context
```
+
+**Figure X. Analyze method example**
+
+**plan**: This method decides whether a new plan is needed and, if so, generates a new configuration plan based on all
+available information in the system, including application and system descriptions, telemetry data, the current plan,
+and available assets. It may also leverage the ML Connector interface to invoke machine learning models. In the example
+shown in Figure X, the plan method creates a new configuration for the CPU frequency of the node on which it runs. If
+the application target is not met, the method sets the CPU to the maximum available frequency; otherwise, it sets it to
+the minimum. The configuration values used in the plan are predefined and known to the policy developer, based on the
+specifications of the corresponding mechanism plugin (see Section 2.4.2 for examples).
+
```python
def plan(context, application_description, system_description, current_plan, telemetry, ml_connector,
         available_assets):
    if telemetry['data']['application_metric'] > application_description['targets']['application_metric']:
        cpu_frequency_command = {
            "command": "set",
            "cpu": "all",
            "frequency": "max"
        }
    else:
        cpu_frequency_command = {
            "command": "set",
            "cpu": "all",
            "frequency": "min"
        }

    new_plan = {
        "CPUFrequency": cpu_frequency_command
    }

    if new_plan != current_plan:
        return new_plan, context

    return current_plan, context
```
+
+**Figure X. Plan method example**
+
+For both the `analyze` and `plan` methods, the arguments are as follows *(subject to change; the actual values are
+documented in the open-source documentation)*:
+
+- **application\_descriptions**: A dictionary or a list of dictionaries containing values from the submitted
+  applications in the system (see Section X).
+- **system\_description**: A dictionary containing system information provided by the system administrator (see Section
+  X).
+- **current\_plan**: The currently active plan for this policy. Since a previously generated plan may have failed, this
+  argument allows the policy plugin to handle such scenarios appropriately.
+- **telemetry**: A dictionary containing telemetry data from both the system and the applications.
+- **ml\_connector**: An object handler providing access to the ML Connector service endpoint within the slice. This
+  argument is empty if the ML Connector service is not available (see the documentation).
+
+As described in Section 2.1, the above plugin methods are invoked and executed within the respective Analyze and Plan
+tasks. The Plan Scheduler ensures that any conflicts between different policy-generated plans are resolved and forwards
+them to the Execute tasks, which utilize the mechanism plugins to apply the configuration to the system. The declaration
+of machine learning model usage for each plugin enables MLSysOps to track where and when machine learning mechanisms are
+employed, monitor their performance, and disable plugins that utilize AI tools if requested. The plug-and-play support
+further allows for the dynamic modification of configuration logic, enabling agents to adapt to varying operational
+scenarios.
\ No newline at end of file
diff --git a/docs/design/spade.md b/docs/design/spade.md
new file mode 100755
index 0000000..0dd66e9
--- /dev/null
+++ b/docs/design/spade.md
@@ -0,0 +1,189 @@
+# SPADE
+
+SPADE (Smart Python Agent Development Environment) [56] is a middleware platform for multi-agent systems written in
+Python, leveraging the capabilities of instant messaging to manage communication between agents. It simplifies the
+development of intelligent agents by combining a robust platform connection mechanism with an internal message
+dispatcher that efficiently routes messages to various integrated behaviours. Each agent in SPADE is identified by a
+unique Jabber ID (JID) and connects to an XMPP server using valid credentials. The use of XMPP not only provides
+persistent connections and reliable message delivery but also enables real-time presence notifications, which are
+essential for determining the availability of agents in a dynamic environment.
+The selection of SPADE as the middleware for our multi-agent system is based on several key factors.
Its native Python +implementation allows for seamless integration with machine learning libraries and other smart applications, ensuring +that sophisticated functionalities can be embedded directly within the agents. SPADE adheres to the FIPA standards, +promoting interoperability with other agent platforms such as JADE, which is crucial for systems requiring diverse +communication protocols. Furthermore, its modular architecture and open-source nature foster a vibrant community for +continuous improvement, supporting extensibility through plugins, and custom behaviours. This robust, flexible design +not only accelerates the development cycle but also provides a reliable foundation for building complex, intelligent +multi-agent systems. + +## Behaviours + +In SPADE, a behaviour is a modular piece of functionality that encapsulates a specific task or activity for an agent. +Behaviours determine how an agent reacts to incoming messages, processes information, or interacts with its environment. +They can operate in different modes: + +- Cyclic and Periodic behaviours are useful for performing repetitive tasks. +- One-Shot and Time-Out behaviours can be used to perform casual tasks or initialization tasks. +- The Finite State Machine allows more complex behaviours to be built. + +This flexible structure allows us to efficiently delegate tasks without overcomplicating the agent’s core logic, +ensuring clean and maintainable design. +Within our multi-agent system, each agent is assigned specific behaviours based on its role and the tasks it needs to +perform. Each type of agent includes unique behaviours that enable it to carry out specialized tasks. For example, the +Continuum Agent is responsible for interacting with both other agents and the user, incorporating behaviours for +processing user requests, such as responding to a ping message to confirm aliveness, processing application and ML model +descriptions, and checking the deployment status of these applications or models. And other behaviours to process +agent’s interactions that are also common functionalities across the agents, for example handling subscriptions and +heartbeat messages. + +As previously mentioned, agents are structured across three layers: Node Agents, Cluster Agents, and a central Continuum +Agent. Each type of agent is assigned a specific set of behaviours that align with its role in the system. These +behaviours enable the agents to communicate, monitor health, process application logic, and adapt to runtime conditions. +Below is a detailed explanation of each behaviour, followed by a description of which agent types implement them. + +- Heartbeat Behaviour: This behaviour is used by node and cluster agents and is responsible for periodically sending a + signal that indicates the agent is alive. These heartbeat messages are used by higher-layer agents to maintain an + up-to-date view of active agents in the system. +- Subscribe Behaviour: Used by Node and Cluster Agents, this behaviour sends a subscription request to a higher-layer + agent. It allows the agent to join the hierarchical structure and start reporting to its parent, establishing the + control flow across layers. +- Message Receiving Behaviour: Present in all agent types, this behaviour allows an agent to handle incoming messages + from other agents. These messages may contain commands, data updates, or coordination requests. It is essential for + asynchronous interaction across the distributed system. 
+- Message Sending Behaviour: Also implemented by all agent types, this behaviour handles sending messages to other + agents. It enables agents to initiate communication, send results, or trigger actions elsewhere in the system. +- Management Mode Behaviour: This behaviour allows agents to switch between different decision-making strategies. It is + present across all agents and can dynamically toggle between heuristic control and machine learning-based approaches. + This flexibility allows the system to adjust its intelligence level based on runtime context or user commands. +- Policy & Mechanism Plugin Management Behaviour: Implemented by all agents, this behaviour allows enabling or disabling + plugins at runtime. It supports dynamic reconfiguration of agent logic and enhances adaptability without requiring + redeployment. +- HB Receiver Behaviour: This behaviour is used in Cluster and Continuum Agents. It receives heartbeat signals sent by + lower-layer agents (e.g., Nodes), updates their status, and maintains a local registry of active agents. +- Check Inactive Agents Behaviour: This behaviour complements the heartbeat mechanism. It is used in Cluster and + Continuum Agents to scan the list of subscribed agents and identify those that have stopped sending heartbeats, + indicating failure or disconnection. +- Manage Subscription Behaviour: Implemented by Cluster and Continuum Agents, this behaviour accepts and registers + agents from a lower layer. It enables agents to expand their scope of management as new agents come online and request + to be part of the system. +- API Ping Behaviour: Exclusive to the Continuum Agent, this behaviour allows external components such as the + command-line interface to verify whether the agent is alive by sending ping messages through the North Bound API. +- Check App Deployment Behaviour: This behaviour is also unique to the Continuum Agent. It verifies that the components + of an application have been properly deployed in the framework. It ensures application consistency across the + infrastructure. +- Check ML Deployment Behaviour: Like the previous behaviour but focused on machine learning applications. It verifies + that ML services are correctly deployed and ready for operation. This behaviour also exists only in the Continuum + Agent. +- App Process Behaviour: Implemented in the Continuum Agent, this behaviour analyzes the application description and + determines how and where to deploy its components. It interprets application specifications and translates them into + deployment strategies. +- ML Process Behaviour: This behaviour, also integrated into the Continuum Agent, is responsible for managing the full + lifecycle of machine learning endpoint deployments. It handles the deployment of new ML models or services, monitors + the status of deployed models in real-time, supports redeployment in case of model updates or infrastructure changes, + and manages the deletion of endpoints when they are no longer needed. This ensures a consistent and automated approach + to maintaining ML services across the continuum infrastructure. + +Each of these behaviours is designed to work in harmony within its respective agent, ensuring that the system remains +modular, scalable, and responsive to dynamic environments. As we continue to develop our framework, we can further +refine each behaviour to meet the specific requirements of our agents and enhance the overall efficiency of the +multi-agent system. 
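+As an illustration of how such behaviours are expressed in SPADE, the following sketch shows a heartbeat-style
+PeriodicBehaviour paired with a template-filtered receiver. The JIDs, period, and metadata values are hypothetical and
+only demonstrate the general pattern; they are not the framework's actual identifiers.
+
```python
from spade.agent import Agent
from spade.behaviour import CyclicBehaviour, PeriodicBehaviour
from spade.message import Message
from spade.template import Template


class NodeAgent(Agent):
    class HeartBeat(PeriodicBehaviour):
        """Periodically informs the parent agent that this agent is alive."""

        async def run(self):
            msg = Message(to="cluster@xmpp.example.org")  # hypothetical parent JID
            msg.set_metadata("performative", "inform")
            msg.set_metadata("ontology", "heartbeat")
            msg.body = "alive"
            await self.send(msg)

    async def setup(self):
        self.add_behaviour(self.HeartBeat(period=10))  # send a heartbeat every 10 seconds


class ClusterAgent(Agent):
    class HBReceiver(CyclicBehaviour):
        """Receives heartbeat messages and updates the registry of active agents."""

        async def run(self):
            msg = await self.receive(timeout=30)
            if msg:
                print(f"heartbeat from {msg.sender}")

    async def setup(self):
        # The template ensures this behaviour only sees heartbeat messages.
        template = Template()
        template.set_metadata("performative", "inform")
        template.set_metadata("ontology", "heartbeat")
        self.add_behaviour(self.HBReceiver(), template)
```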
+ +| Behaviour Name | Type | **Type of Agent** | | | +|-----------------------|----------|:-----------------:|:-------:|:----:| +| | | Continuum | Cluster | Node | +| API Ping | Cyclic | x | | | +| Check inactive | Periodic | x | x | | +| Check _app_deployment | One Shot | x | | | +| Check_ml_deployment | One Shot | x | | | +| HB Receiver | Cyclic | x | x | | +| HeartBeat | Periodic | | x | x | +| ML_process | Cyclic | x | | | +| Manage Subscription | Cyclic | x | x | | +| Management mode | Cyclic | x | x | x | +| Message receiving | Cyclic | x | x | x | +| Message sending | One Shot | x | x | x | +| Process | Cyclic | x | | | +| Subscribe | Cyclic | | x | x | +| Policy management | Cyclic | x | x | x | + +## Messages + +SPADE agents communicate by exchanging discrete messages rather than direct method calls, embodying the “computing as +interaction” paradigm of multi-agent systems. As previously mentioned, each agent is identified by a unique ID (JID) ( +username@domain) and connects to an XMPP server using this ID and a password. SPADE relies on the XMPP (Extensible +Messaging and Presence Protocol) as the backbone for all inter-agent communication. This means that every message an +agent sends is transmitted as an XMPP message stanza through the server to the target agent. By using XMPP’s standard +messaging infrastructure, SPADE ensures that agents can reliably send and receive messages in real time, even across +different hosts or network environments. In essence, the XMPP server mediates the exchange, routing each message to the +intended recipient agent (identified by its JID) whether the agents reside on the same machine or are distributed over +the Internet. This decoupled, server-mediated communication model provides a robust and standardized way for agents to +interact, leveraging XMPP’s features for authentication, presence, and security (e.g. encryption) built into the +protocol. +For this XMPP server, the open-source ejabberd service was selected due to its superior scalability, reliability, +performance, security, ease of integration with SPADE, and strong community support. The configuration of the service is +made on the config file providing a domain and each agent can register into it by using a jabber id and a password. Once +the agent is registered it is ready to start exchanging messages with other registered agents. + +### Message Dispatching and Templates + +Within each SPADE agent, an internal message dispatcher handles incoming and outgoing messages. This dispatcher +functions like a mail sorter: when a message arrives for the agent, the dispatcher automatically places it into the +proper “mailbox” for handling, and when the agent sends a message, the dispatcher injects it into the XMPP communication +stream. The key to this routing is the use of message templates. Each behaviour (task) running within an agent can be +associated with a message template that defines the criteria for messages it is interested in. A template can specify +fields such as the sender’s JID, the message’s content or subject, thread/conversation ID, or metadata like performative +and ontology. When an agent receives a message, the dispatcher compares the message against the templates of all active +behaviours and delivers the message to the behaviour whose template it matches. In this way, templates act as filters to +ensure each behaviour only processes relevant messages. 
For example, a template might match messages with a particular +sender and a specific performative type, so that only messages from that sender with that communicative intent will +trigger the associated behaviour. Messages that meet the template conditions are queued in the target behaviour’s +mailbox, where the behaviour can retrieve them asynchronously. This template-based filtering and routing mechanism +allows multiple behaviours to run concurrently in an agent without interfering with each other, as each behaviour will +only pick up the messages meant for it. It provides a structured approach to message handling, simplifying the +development of complex interactions (such as protocol exchanges) by separating them into different behaviour handlers +listening for different message patterns. + +### FIPA Standards for Structured Communication + +SPADE’s messaging model also draws from established standards to ensure that communications are well-structured and +interoperable. In particular, SPADE supports message formats inspired by the FIPA (Foundation for Intelligent Physical +Agents) Agent Communication Language standards. FIPA defines a set of message fields and interaction protocols intended +to promote clear semantics and compatibility among agents. In SPADE, each message’s metadata can include standard +FIPA-ACL fields like the performative (which describes the intent of the message, such as “inform” or “request”), the +ontology (which defines the domain of discourse or vocabulary of the message content), and the language (the format of +the message content). By allowing these fields in the message structure, SPADE ensures that every message carries not +just raw data but also contextual information about how to interpret that data. Adhering to FIPA communication standards +means that SPADE agents follow a common protocol syntax and semantics, which in principle makes it easier for them to +interact with agents from other FIPA-compliant platforms. In other words, the use of well-defined performatives and +message fields imposes a consistent structure on messages, reducing ambiguity and enhancing interoperability. This +standards-based approach to message handling helps achieve a level of consistency in agent communication, so that the +intent and context of messages are understood in a uniform way across different agents and systems. Ultimately, SPADE’s +alignment with FIPA standards reinforces structured agent interactions and lays the groundwork for integration with the +broader multi-agent ecosystem where such standards are followed. + +## Interactions – Coordination + +Agents rely on behavioural logic and message exchanges to interact and coordinate tasks across the layers of the +continuum. Below is an example illustrating how subscription and heartbeat mechanisms are implemented using agent +behaviours to facilitate this interaction. + +### Subscription and Heartbeat Messages + +Subscription and heartbeat messages are essential processes used to register available nodes within the continuum and to +maintain up-to-date information about the status of nodes and agents across the entire system. + + + +In the subscription process, a lower-layer agent sends a subscription request using the subscribe performative to an +upper-layer agent. Since the behaviour is cyclic, the lower-layer agent continues to send subscription requests until it +receives a subscription accepted message from the upper-layer agent. 
Once the acknowledgment is received, the agent +stops the cyclic subscription behaviour and initiates the periodic heartbeat behaviour. + +During the heartbeat phase, the lower-layer agent periodically sends heartbeat (HB) messages using the inform +performative. The upper-layer agent, which runs a heartbeat receiver behaviour, constantly listens for these messages +and updates its records based on the latest HB information. + +As shown in Figure above, the interaction between agents through message exchanges and behaviour logic enables the +coordinated execution of various tasks across the continuum. In this example, the focus is on the subscription and +heartbeat process. These coordinated mechanisms allow agents to collaborate and support broader functionalities within +the framework, \ No newline at end of file diff --git a/docs/design/system-description.md b/docs/design/system-description.md new file mode 100644 index 0000000..9625525 --- /dev/null +++ b/docs/design/system-description.md @@ -0,0 +1,11 @@ +The infrastructure descriptions must be provided during the agent installation process. Taking the example of a Node +description registration, it is propagated to the framework using a Bottom-Up approach, in contrast to the application +registration solution (Top-Down), as shown in Figure 43. To this end, we follow the usual node registration protocols, +e.g., node registration to a Kubernetes cluster. In our case, the Node agent sends the respective description to the +Cluster agent, which transforms it into a Custom Resource and applies it to Kubernetes. In addition, the Cluster agent +updates the Cluster formal description (also defined as a Custom Resource) with high-level information that can be used +by the Continuum agent in order to perform filtering based on the available sensors, accelerators, and node types (e.g., +to meet any relevant application deployment constraints). Finally, the Cluster agent notifies the Continuum agent, via +the agent protocol, so that the latter can update its Cluster-related structures. + +![system_description_sequence.png](../assets/img/system_description_sequence.png) \ No newline at end of file diff --git a/docs/design/telemetry.md b/docs/design/telemetry.md new file mode 100755 index 0000000..e9452bd --- /dev/null +++ b/docs/design/telemetry.md @@ -0,0 +1,94 @@ +# Telemetry System + +The telemetry plane of MLSysOps collects the data necessary from all layers to drive the +configuration decisions, potentially made using ML inference and continual training, with appropriate aggregation and +abstraction towards the higher layers of the hierarchy. This section will give the technical insights of the MLSysOps +telemetry system that supports the collection of performance metrics across MLSysOps resource management layers. + +## OpenTelemetry Specification + +The MLSysOps framework operates on different layers of the cloud-edge-far-edge continuum and manages highly +heterogeneous systems and applications while simultaneously providing appropriate observability user interfaces, as +illustrated in Figure 5. Given the diversity of tools and vendors involved, a vendor- and tool-agnostic protocol for +collecting and transmitting telemetry data is essential. + + + +[OpenTelemetry](https://opentelemetry.io), a well-defined open-source system, provides the foundation for MLSysOps +observability capabilities. 
+
+OpenTelemetry is an all-around observability framework that handles all the necessary signals (categories of telemetry
+data), such as traces, metrics, and logs. MLSysOps supports and uses all three signal categories. The basic signal that
+is exposed to the framework’s users is metrics, whereas logs and traces are used for debugging and profiling purposes.
+
+OpenTelemetry offers an API as well as the software components that implement the various telemetry functionalities, in the
+form of a Software Development Kit (SDK).
+
+
+
+The central software component is the OpenTelemetry Collector (OTEL Collector), a vendor-agnostic implementation of the
+telemetry pipeline (as presented in Figure 6) that consists of three stages: i) receive, ii) process, and iii) export.
+This component is versatile and flexible, being able to receive telemetry data in multiple formats, process it in
+different ways, and export it to other systems. The OTEL Collectors can operate in two different modes: i) Agent and
+ii) Gateway mode. The main difference is that in Gateway mode, the collector receives telemetry data from other
+collectors (that, in turn, operate in Agent or Gateway mode). This makes the gateway a centralized aggregator for any
+underlying OTEL Collectors.
+
+The OTEL Collector [4] baseline implementation offers the minimum required functionality, which is a set of receivers
+and exporters that communicate using data conforming to the OpenTelemetry Data Specification, using either HTTP or gRPC
+protocols, as well as specific processing capabilities, like batching and memory control. The OpenTelemetry Collector
+repository [4] includes multiple plugin implementations for all three stages of the telemetry pipeline, like Prometheus
+receivers & exporters, as well as a routing processor that can create different telemetry streams in the OTEL Collector
+pipeline. In MLSysOps, we fully leverage the available plugins to achieve the desired functionality for the OTEL
+Collectors at each level of the MLSysOps hierarchy: node, cluster, and continuum.
+
+The OpenTelemetry specification defines a way to collect and transfer telemetry data without making any assumptions
+about the storage of the data. MLSysOps follows the same paradigm and does not make any assumptions on this matter,
+although, for the development of the framework, we use the Mimir metrics database [5], an open-source system
+that is suitable for the needs of our telemetry system. Mimir is deployed at the highest level (continuum), storing
+telemetry data and offering an API for flexible telemetry data querying through its PromQL interface.
+PromQL [6] is quite versatile and powerful for time-series data, allowing different clients (such as ML models) to
+easily consume the data they need, using further aggregation and transformation functions. On top of the Mimir database,
+MLSysOps uses Grafana [7] for data visualization, Loki [8] as the log message database, and Grafana Tempo [9] for
+trace storage. All these components belong to the same ecosystem and work seamlessly with each other without the need for
+further configuration. We leverage PromQL for the MLSysOps Telemetry API at the higher levels, implementing the
+necessary functionality for providing telemetry data to other entities of the system, and the Prometheus metric format
+for providing telemetry data at the same level.
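+
+As a rough illustration of how a client (for example, an ML model or a policy) could consume metrics through this
+PromQL-based path, the sketch below issues an instant query against a Prometheus-compatible HTTP endpoint. The endpoint
+URL, the `/prometheus/api/v1/query` path, and the `node_load1` metric are placeholders for this example, not fixed
+MLSysOps values.
+
+```python
+import requests
+
+# Prometheus-compatible HTTP API exposed by the continuum-level metrics store (placeholder URL).
+QUERY_URL = "http://continuum-telemetry.example:9009/prometheus/api/v1/query"
+
+# Example PromQL expression: average 1-minute load over the last 5 minutes, per node.
+query = "avg_over_time(node_load1[5m])"
+
+response = requests.get(QUERY_URL, params={"query": query}, timeout=10)
+response.raise_for_status()
+
+for series in response.json()["data"]["result"]:
+    labels = series["metric"]           # e.g. {"instance": "node-1", ...}
+    timestamp, value = series["value"]  # an instant query returns a single sample per series
+    print(labels.get("instance", "unknown"), value)
+```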
+
+The deployment of the OTEL Collectors on each node is performed using the appropriate containers, and the orchestration
+is done through the same orchestrator as the one used for the application components. This simplifies the management and
+the connectivity between the application components and the OTEL Collectors. The deployment configuration is done
+transparently and automatically by the MLSysOps framework, which is responsible for the initial deployment and
+configuration of the OTEL Collector pods on every node and layer, as well as the dynamic reconfiguration of the
+collectors at runtime.
+
+## Node Level Telemetry
+
+On each node, the OTEL Collector operates in Agent mode. As Figure 7 illustrates, it receives data from any entity that
+needs to emit telemetry data into the telemetry system. This is done either through the available interfaces (see the
+Telemetry API reference) or through the appropriate configuration of the OTEL Collector, enabling the desired
+receiver. The node-level collector then periodically pushes telemetry data to the OTEL Collector that operates in
+Gateway mode at the cluster level. The OTEL Collector on each node can process the data in batches, keeping the
+overhead low. For instance, for an application component sending telemetry data at a high rate, the collector agent can
+store the data in memory (perhaps even process it to perform filtering and aggregation; see below) and then forward it
+to the gateway at a lower rate.
+It is also possible to apply transformations and aggregations to the raw data before forwarding it to the gateway
+collector. Note that the OTEL Collector at the node level can route the raw and the transformed telemetry data to
+different exporters. The raw-data export route provides an endpoint that can be queried locally.
+
+
+
+## Cluster Level Telemetry
+
+At the cluster level, different components need to be monitored. The telemetry data in this layer must describe the
+status of the cluster rather than of a specific node. The main source of information for this level is the orchestration
+manager. There is, therefore, a dedicated OTEL Collector configured to collect metrics from the orchestrator.
+
+
+
+## Continuum Level Telemetry
+
+At the highest level of the MLSysOps framework, telemetry data is used not only for configuration decisions but also for
+informational and debugging purposes. This layer also includes components for telemetry data storage and visualization.
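+
+To make the node-level data path described above concrete, the following minimal sketch shows how an application
+component could emit a custom metric towards the node-level OTEL Collector using the standard OpenTelemetry Python SDK
+(packages `opentelemetry-sdk` and `opentelemetry-exporter-otlp`). The collector endpoint, metric name, and attribute
+values are placeholders for illustration, not fixed framework values.
+
+```python
+from opentelemetry import metrics
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
+
+# OTLP/gRPC receiver of the node-level OTEL Collector (placeholder endpoint).
+exporter = OTLPMetricExporter(endpoint="http://localhost:4317", insecure=True)
+
+# Export collected metrics periodically, so the collector can batch and forward them.
+reader = PeriodicExportingMetricReader(exporter, export_interval_millis=10_000)
+metrics.set_meter_provider(MeterProvider(metric_readers=[reader]))
+
+meter = metrics.get_meter("example-app-component")
+frames_counter = meter.create_counter(
+    "frames_processed",  # placeholder metric name
+    description="Number of frames processed by the component",
+)
+
+# Somewhere in the component's processing loop:
+frames_counter.add(1, {"component": "server", "node": "node-1"})
+```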
+
+
+
diff --git a/docs/developer-guide/Code-of-Conduct.md b/docs/developer-guide/Code-of-Conduct.md old mode 100644 new mode 100755
diff --git a/docs/developer-guide/contribute.md b/docs/developer-guide/contribute.md old mode 100644 new mode 100755
diff --git a/docs/developer-guide/development.md b/docs/developer-guide/development.md old mode 100644 new mode 100755
diff --git a/docs/developer-guide/index.md b/docs/developer-guide/index.md old mode 100644 new mode 100755
diff --git a/docs/developer-guide/maintainers.md b/docs/developer-guide/maintainers.md old mode 100644 new mode 100755
diff --git a/docs/hooks/__pycache__/copyright.cpython-310.pyc b/docs/hooks/__pycache__/copyright.cpython-310.pyc new file mode 100755 index 0000000..4bdf478 Binary files /dev/null and b/docs/hooks/__pycache__/copyright.cpython-310.pyc differ
diff --git a/docs/hooks/copyright.py b/docs/hooks/copyright.py old mode 100644 new mode 100755
diff --git a/docs/index.md b/docs/index.md old mode 100644 new mode 100755 index 22a6d3f..0c3ab5c --- a/docs/index.md +++ b/docs/index.md @@ -1,17 +1,80 @@
# MLSysOps Framework
+The MLSysOps framework operates in the context of a heterogeneous, multi-layered computing continuum, ranging from
+centralized cloud infrastructures to resource-constrained far-edge devices. The objective of the framework is to enable
+autonomic, explainable, and adaptive system management by leveraging artificial intelligence, with minimal human intervention.
+
+
+
+The design of MLSysOps is guided by a system model that introduces the concept of a slice — a logical grouping of computing,
+networking, and storage resources across the continuum that is managed as a unit. Each slice is governed by its own
+deployment of the MLSysOps control plane and encompasses physical or virtualized resources at different layers.
+
+In essence, the framework operates as an abstraction middleware between the participating entities.
+
## Key features
+* Kubernetes deployment management.
+* Multi-cluster management using Karmada.
+* Dynamically configured telemetry system.
+* Plugin systems for configuration policies and mechanisms.
+* Application deployment model, using Kubernetes Custom Resource Definitions (CRDs).
+* System infrastructure inventory, using Kubernetes Custom Resource Definitions (CRDs).
+* REST API endpoints (Northbound API service), which can be used via a dedicated CLI.
+* ML Connector service, for easy ML model management, deployment, retraining, and explainability.
+* Node-level management.
+* Deployment using different container runtimes.
+* Management of resource-constrained devices (far-edge devices).
+* Storage service managed by the framework.
## Use cases
+* Optimize the deployment of an application in a continuum system slice, using smart policies that can make use of ML models.
+ * Achieve application targets specified in the application deployment description.
+ * Optimize system targets based on system descriptions.
+* Implement arbitrary configuration mechanisms and expose them to configuration policies.
## Current support
+| Feature                            | Status             | Stability |
+|------------------------------------|--------------------|-----------|
+| Kubernetes Management              | :heavy_check_mark: | Alpha     |
+| Multi-cluster deployment           | :heavy_check_mark: | Alpha     |
+| Multi-cluster networking           | :x:                | -         |
+| Dynamic telemetry system           | :heavy_check_mark: | Alpha     |
+| Plugin system                      | :heavy_check_mark: | Alpha     |
+| Application & System descriptions  | :heavy_check_mark: | Alpha     |
+| Basic ML management                | :heavy_check_mark: | Alpha     |
+| Node level management              | :x:                | -         |
+| Far-edge devices                   | :x:                | -         |
+| Managed Storage service            | :x:                | -         |
+
+
+
+**Mechanism Plugins**
+
+| Plugin Name | Description                                                      | Configuration options             |
+|-------------|------------------------------------------------------------------|-----------------------------------|
+| Fluidity    | Provides the capability to manage Kubernetes pods and services.  | Deploy/remove/relocate components |
+
+**Policy Plugins**
+
+| Plugin Name            | Description                              |
+|------------------------|------------------------------------------|
+| staticPlacedComponents | Provides the logic to place components.  |
+
## Quick links
- [Contributing](developer-guide/contribute/)
+- [Website](https://mlsysops.eu)
+
+---
+## Acknowledgements
+
+
diff --git a/docs/installation.md b/docs/installation.md old mode 100644 new mode 100755 index ea9692b..f850a09 --- a/docs/installation.md +++ b/docs/installation.md @@ -8,22 +8,119 @@ We will be installing and setting up each component individually:
### Core Framework
-TBC
+# MLSysOps Framework Installation
+The main prerequisite is that a Karmada instance is installed, with at least one Kubernetes cluster registered.
+We assume that Karmada is installed in a standalone cluster.
+The Karmada instance should include the `karmada-search` plugin.
-### Plugins
+The MLSysOps Framework consists of three main components, called MLSysOps Agents. These components require the following
+services to operate before starting:
-TBC
+* Ejabberd XMPP Server
+* Redis
+* Docker installed in Karmada-Management VM
-### Mechanisms
+There are two services that provide additional functionalities to the user:
-TBC
+- **Northbound API**: This service is part of the MLSysOps agents. It provides endpoints for controlling the components and behaviors of the agents.
+- **ML Connector**: This service is responsible for managing and deploying Machine Learning models. It exposes its functionality through a separate API.
-## Install required dependencies
+To ensure the correct bootstrap, the agents should start in the following order:
+1. Continuum agent
+2. Cluster agent
+3. Node agents
-TBC
-> Note: Be aware that some instructions might override existing tools and services.
+All the deployments take place in a Kubernetes cluster, in a separate namespace, `mlsysops-framework`. All the third-party services,
+as well as the Continuum agent, are deployed in the management cluster, i.e., the same cluster that hosts Karmada.
+
+
+# System descriptions preparation
+Before the installation process takes place, system descriptions for every layer must be prepared.
+A system description is a YAML file, implemented as a Kubernetes CRD.
+Examples can be found in the `descriptions/` directory.
+The descriptions for each layer reside in the respectively named directory: continuum, clusters, nodes.
+Each file MUST have the name of the corresponding hostname, followed by the .yaml or .yml suffix.
+For example, a machine at the node level, with hostname `node-1`, should have a description file named `node-1.yaml` under
+the directory `nodes/`.
+
+* **Continuum** level descriptions require a single file that declares the continuumID and the clusters that we allow MLSysOps to manage.
+* **Cluster** level descriptions require a file for each cluster registered in Karmada. Each file contains the clusterID and a list of node hostnames that MLSysOps is allowed to manage.
+* **Node** level descriptions contain the detailed information about the node resources. Example [here](descriptions/nodes/node-1.yaml).
+
+# Option 1: Automated Deployment
+
+The MLSysOps CLI tool can be used to automatically deploy all the necessary components.
+It needs the kubeconfigs of the Karmada host cluster and the Karmada API.
+
+- `export KARMADA_HOST_KUBECONFIG=`
+- `export KARMADA_API_KUBECONFIG=`
+- `export KARMADA_HOST_IP=`
+
+Then execute the CLI command inside the `deployments` directory, with the `descriptions` directory files prepared:
+- `python3 deploy.py`
+
+# Option 2: Manual Deployment
+
+## Continuum - Management Cluster
+In the Karmada host cluster: `export KUBECONFIG=`
+
+
+- Create namespace
+  - `kubectl apply -f namespace.yml`
+
+- Install required services
+  - Change `POD_IP` to Karmada host IP in `xmpp/deployment.yaml`.
+  - `kubectl apply -f xmpp/deployment.yaml`
+  - Change `{{ KARMADA_HOST_IP }}` to Karmada host IP in `api-service-deployment.yaml`.
+  - `kubectl apply -f api-service-deployment.yaml`
+  - Change `{{ KARMADA_HOST_IP }}` to Karmada host IP in `redis-stack-deployment.yaml`.
+  - `kubectl apply -f redis-stack-deployment.yaml`
+  - Start the ML Connector: `docker compose -f mlconnector.docker-compose.yaml up -d`
-The following packages are required to complete the installation. Depending
-on your specific needs, some of them may not be necessary in your use case.
+- Apply RBAC
+  - `kubectl apply -f mlsysops-rbac.yaml`
+
+- Attach Karmada API kubeconfig as ConfigMap
+  - `kubectl create configmap continuum-karmadapi-config --from-file= --namespace=mlsysops-framework`
+
+- Attach Continuum system description as ConfigMap
+  - `kubectl create configmap continuum-system-description --from-file= --namespace=mlsysops-framework`
+
+- Start the Continuum agent
+  - `kubectl apply -f continuum-agent-daemonset.yaml`
+
+---
+In the Karmada API server: `export KUBECONFIG=`
+
+## Cluster deployments
+
+- Set up the Karmada propagation policies
+  - `kubectl apply -f cluster-propagation-policy.yaml`
+  - `kubectl apply -f propagation-policy.yaml`
+
+- Create the namespace
+  - `kubectl apply -f namespace.yaml`
+
+- Apply RBAC
+  - `kubectl apply -f mlsysops-rbac.yaml`
+
+- Create a ConfigMap based on the system descriptions, where each file is named after the hostname of the cluster's management node
+  - `kubectl create configmap cluster-system-description --from-file=descriptions/clusters --namespace=mlsysops-framework`
+
+- Apply the daemonset YAML file
+  - Change `{{ KARMADA_HOST_IP }}` to Karmada host IP in `cluster-agents-daemonset.yaml`.
+  - `kubectl apply -f cluster-agents-daemonset.yaml`
+
+## Nodes deployment
+
+- Namespaces and RBAC were created with the cluster setup.
+- Prepare the system description for each node, and name each file with the node's hostname.
+- Create a ConfigMap based on the system descriptions for all nodes.
+  - `kubectl create configmap node-system-descriptions --from-file=descriptions/nodes --namespace=mlsysops-framework`
+- Set any required environment variables.
+- Apply the daemonset YAML file
+  - `kubectl apply -f node-agents-daemonset.yaml`
+
+> Note: Be aware that some instructions might override existing tools and services.
diff --git a/docs/overrides/main.html b/docs/overrides/main.html old mode 100644 new mode 100755
diff --git a/docs/overrides/partials/breadcrumb.html b/docs/overrides/partials/breadcrumb.html old mode 100644 new mode 100755
diff --git a/docs/quickstart.md b/docs/quickstart.md old mode 100644 new mode 100755 index 00e83aa..12f403f --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -1,7 +1,234 @@
This document acts as a quickstart guide to showcase indicative features of the
-`MLSysOps framework`. Please refer to the [installation guide](../installation)
+`MLSysOps Framework`. Please refer to the [installation guide](../installation.md)
for more detailed installation instructions, or the [design](../design#architecture)
document for more details regarding `MLSysOps`'s architecture.
-``
+## MLSysOps Framework Installation
+The main prerequisite is that a Karmada instance is installed, with at least one Kubernetes cluster registered.
+We assume that Karmada is installed in a standalone cluster.
+The Karmada instance should include the `karmada-search` plugin.
+You can follow the instructions in [Testbed installation](testbed.md) to create the appropriate environment.
+
+The MLSysOps Framework consists of three main components, called MLSysOps Agents. These components require the following
+services to operate before starting:
+
+* Ejabberd XMPP Server
+* Redis
+* Docker installed in Karmada-Management VM
+
+There are two services that provide additional functionalities to the user:
+
+- **Northbound API**: This service is part of the MLSysOps agents. It provides endpoints for controlling the components and behaviors of the agents.
+- **ML Connector**: This service is responsible for managing and deploying Machine Learning models. It exposes its functionality through a separate API.
+
+To ensure the correct bootstrap, the agents should start in the following order:
+1. Continuum agent
+2. Cluster agent
+3. Node agents
+
+
+All the deployments take place in a Kubernetes cluster, in a separate namespace, `mlsysops-framework`. All the third-party services,
+as well as the Continuum agent, are deployed in the management cluster, i.e., the same cluster that hosts Karmada.
+
+#### Step 1: Clone the repo
+
+`git clone https://github.com/mlsysops-eu/mlsysops-framework`
+
+and enter the `deployments` directory
+
+`cd deployments`
+
+#### Step 2: System descriptions preparation
+Before the installation process takes place, system descriptions for every layer must be prepared.
+A system description is a YAML file, implemented as a Kubernetes CRD.
+Examples can be found in the `descriptions/` directory.
+The descriptions for each layer reside in the respectively named directory: continuum, clusters, nodes.
+Each file MUST have the name of the corresponding hostname, followed by the .yaml or .yml suffix.
+For example, a machine at the node level, with hostname `node-1`, should have a description file named `node-1.yaml` under
+the directory `nodes/`.
+
+* **Continuum** level descriptions require a single file that declares the continuumID and the clusters that we allow MLSysOps to manage.
+* **Cluster** level descriptions require a file for each cluster registered in Karmada. Each file contains the clusterID and a list of node hostnames that MLSysOps is allowed to manage.
+* **Node** level descriptions, contain the detailed information about the node resources. Example [here](descriptions/nodes/node-1.yaml). + +Before deploying, prepare system descriptions as Kubernetes CRDs: + +- Stored in the `descriptions/` directory + +### 📁 File structure: + +``` +descriptions/ +├── continuum/ +│ └── .yaml +├── clusters/ +│ └── .yaml +└── nodes/ + └── .yaml +``` + +Descriptions define IDs, managed components, and resource details. All files are required before installation. + +--- + +### Step 3: Deploy the Framework + +There are two ways to deploy the framework: + +#### ✅ Option 1: Automated using the MLSysOps CLI + +You can install the CLI in two ways: + +**From TestPyPI:** + +```bash +pip install -i https://test.pypi.org/simple/ mlsysops-cli==0.1.9 +``` + +**From GitHub (includes deployments folder):** + +```bash +git clone https://github.com/marcolo-30/mlsysops-cli.git +cd mlsysops-cli +pip install -e . +``` + +This exposes the `mls` command. + +**Set environment variables:** + +```bash +export KARMADA_HOST_KUBECONFIG= +export KARMADA_API_KUBECONFIG= +export KARMADA_HOST_IP= +``` + +**Run deployment:** + +```bash +cd deployments/ +mls framework deploy-all +``` + +This will: +- Deploy core services (ejabberd, redis, API service) +- Register system descriptions +- Deploy all agents in correct order + +**Alternative:** +You can also run the CLI script directly: + +```bash +cd deployments +python3 deploy.py +``` + +Wait for all pods to be created: + +```bash +kubectl get pods -n mlsysops-framework +``` + +--- + +#### 🛠 Option 2: Manual Deployment + +Follow the order below to deploy manually if you prefer full control. + +### 📍 Management Cluster (Continuum) + +```bash +export KUBECONFIG= +``` + +- Create namespace: +```bash +kubectl apply -f namespace.yaml +``` + +- Install services: +```bash +kubectl apply -f xmpp/deployment.yaml +kubectl apply -f api-service-deployment.yaml +kubectl apply -f redis-stack-deployment.yaml +``` + +- Start ML Connector: +```bash +docker compose -f mlconnector.docker-compose.yaml up -d +``` + +- Apply RBAC: +```bash +kubectl apply -f mlsysops-rbac.yaml +``` + +- Add configuration and system descriptions: +```bash +kubectl create configmap continuum-karmadapi-config --from-file= --namespace=mlsysops-framework +kubectl create configmap continuum-system-description --from-file=descriptions/continuum/.yaml --namespace=mlsysops-framework +``` + +- Start the Continuum Agent: +```bash +kubectl apply -f continuum-agent-daemonset.yaml +``` + +### 📍 Karmada API Cluster (Cluster Agents) + +```bash +export KUBECONFIG= +``` + +- Apply policies and namespace: +```bash +kubectl apply -f cluster-propagation-policy.yaml +kubectl apply -f propagation-policy.yaml +kubectl apply -f namespace.yaml +kubectl apply -f mlsysops-rbac.yaml +``` + +- Add system descriptions: +```bash +kubectl create configmap cluster-system-description --from-file=descriptions/clusters --namespace=mlsysops-framework +``` + +- Start Cluster Agents: +```bash +kubectl apply -f cluster-agents-daemonset.yaml +``` + +### 📍 Node Agents + +- Ensure node descriptions are in place +- Add them via ConfigMap: +```bash +kubectl create configmap node-system-descriptions --from-file=descriptions/nodes --namespace=mlsysops-framework +``` + +- Start Node Agents: +```bash +kubectl apply -f node-agents-daemonset.yaml +``` + +--- + +#### Step 4: Deploy a test application + +We use a simple TCP Client - Server application, that send messages periodically. +The files are in `tests/application` of the repo. 
+
+Update the test_CR and test_MLSysOps_description with the node names of the cluster and the clusterID.
+
+Apply the CR:
+`kubectl apply -f tests/application/test_CR.yaml`
+
+or apply the description via the MLS CLI:
+
+`cli/mls.py apps deploy-app --path tests/application/test_MLSysOps_descirption.yaml`
+
+You can watch the pods start and be managed by the MLSysOps Framework. The client pod will be
+relocated every 30 seconds, in round-robin fashion across the worker nodes.
+
+`kubectl get pods -n mlsysops --context clusterID`
diff --git a/docs/references/cli.md b/docs/references/cli.md new file mode 100644 index 0000000..a94ea54 --- /dev/null +++ b/docs/references/cli.md @@ -0,0 +1,42 @@
+The MLS CLI is a Python-based command-line tool designed for managing application deployments and system interactions
+within the MLSysOps framework. It provides functionalities for deploying and managing applications, infrastructure
+components, and machine learning (ML) models, as well as for querying system status and monitoring deployments. The CLI
+communicates with the Northbound API, enabling efficient interaction with the MLSysOps framework.
+Key functionalities provided by the MLS CLI through the NB API include service health checks (ping), application
+deployment, and retrieval of system, application, and ML model statuses. This tool streamlines deployment and management
+workflows, offering an intuitive interface for interacting with the agent-based framework while ensuring efficient
+system operations.
+
+The CLI is organized into distinct command groups, each responsible for managing a specific aspect of the system:
+- apps: Manage application deployment, monitoring, and removal
+- infra: Register and manage infrastructure components across the continuum
+- ml: Handle the deployment and lifecycle of machine learning models
+- manage: Perform general system operations, such as health checks and mode switching
+
+This structured CLI design ensures that different user roles can efficiently interact with the system based on their
+specific needs, further reinforcing the modular and scalable nature of the MLSysOps framework.
+
+The table below presents an overview of the CLI commands currently available. These commands are indicative and may be updated
+or extended in the open-source release.
+ + +| **Group** | **Command** | **Description** | **Parameters** | +|------------------|---------------------------|--------------------------------------------------------|----------------------------------------------------------| +| **APP** | deploy-app | Deploy an application using a YAML file | YAML file using path or URI | +| | list-all | List the applications on the system | - | +| | get-app-status | Get the status of the application | App_id | +| | get-app-details | Get the details of an application | App_id | +| | get-app-performance | Get the performance metric of an application | App_id | +| | remove-app | Remove an application from the system | App_id | +| **INFRA** | register-infra | Register system description | YAML file using path or URI | +| | list | List infrastructure registered | infra_id (Datacenter or cluster ID) | +| | unregister-infra | Remove system description | infra_id | +| **Management** | /config set-mode | Change between ML or Heuristic-normal mode | 0 for Heuristic, 1 for ML | +| | Set System Target | Set infrastructure level targets | List of IDs and list of targets | +| | Config Trust | Configure trust assessment | List of node IDs, list of indexes, and list of weights | +| | Ping | Ping the continuum agent | - | +| **ML** | deploy-ml | Deploy an ML application using a YAML file | YAML file using path or URI | +| | list-all | List the ML models deployed on the system | - | +| | get-status | Get the status of the ML models | model_uid | +| | remove-ml | Remove an ML model from the system | model_uid | diff --git a/docs/references/ml-connector.md b/docs/references/ml-connector.md new file mode 100644 index 0000000..d128adf --- /dev/null +++ b/docs/references/ml-connector.md @@ -0,0 +1,807 @@ +# API Integration Documentation + +**Base URL:** `BASE_URL` + +--- + +# Model Endpoints +## Model Registration +Model registration is a two step process. In the initial step, we add the model metadata using json description defined below. For example, model type, hyperparameter, modeltags and other features. The second step involve adding the model artifacts; .pkl file, training data, requirements file and python script that will be used to retrain the model (See example). +### POST /model/add +**Summary:** Add new ML model metadata. + +**Request Body (`MLModelCreate`):** +```json +{ + "modelname": "RandomForest", + "modelkind": "classification", + "drift_detection": [ + { "is_true": 0, "method": 0 } + ] + // other fields (see endpoint): hyperparameter, modelperformance, trainingresource, runresource, featurelist, inference, modeltags +} +``` + +**Responses:** +- **201**: Created `MLModel` object. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X POST "BASE_URL/model/add" \ + -H "Content-Type: application/json" \ + -d '{ + "modelname": "MyModel", + "modelkind": "regression", + "drift_detection": [{"is_true": 1, "method": 2}] + }' +``` + +**Example Python:** +```python +import requests + +payload = { + "modelname": "MyModel", + "modelkind": "regression", + "drift_detection": [{"is_true": 1, "method": 2}] +} +resp = requests.post("BASE_URL/model/add", json=payload) +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: POST /model/add + Note right of Agent: Body: MLModelCreate JSON + MLConnector-->>Agent: 201 Created +``` + +--- + +### POST /model/{model_id}/upload +**Summary:** Upload a file for a specific model. 
+ +**Path Parameters:** + +| Name | In | Type | Required | Description | +|----------|------|--------|----------|----------------| +| model_id | path | string | yes | ID of the model | + +**Request Body (multipart/form-data):** +- `file` (binary) +- `file_kind`: `model` | `data` | `code` + +**Responses:** +- **201**: `FileSchema` object. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X POST "BASE_URL/model/1234/upload" \ + -F "file=@/path/to/model.pkl" \ + -F "file_kind=model" +``` + +**Example Python:** +```python +import requests + +files = { + "file": open("model.pkl", "rb"), + "file_kind": (None, "model") +} +resp = requests.post("BASE_URL/model/1234/upload", files=files) +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: POST /model/{model_id}/upload + Note right of Agent: multipart/form-data (file, file_kind) + MLConnector-->>Agent: 201 Created +``` + +--- + +### GET /model/all +**Summary:** Get all ML models. + +**Query Parameters:** + +| Name | In | Type | Default | Required | Description | +|-------|-------|---------|---------|----------|-----------------------------| +| skip | query | integer | 0 | no | Number of items to skip | +| limit | query | integer | 100 | no | Maximum number of items | + +**Responses:** +- **200**: Array of `MLModel` objects. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X GET "BASE_URL/model/all?skip=0&limit=50" \ + -H "Accept: application/json" +``` + +**Example Python:** +```python +import requests + +resp = requests.get( + "BASE_URL/model/all", + params={"skip": 0, "limit": 50} +) +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: GET /model/all?skip={skip}&limit={limit} + MLConnector-->>Agent: 200 OK +``` + +--- + +### GET /model/getkind/{modelkind} +**Summary:** Get models by kind. + +**Path Parameters:** + +| Name | In | Type | Required | Description | +|-----------|------|--------|----------|------------------------------------| +| modelkind | path | string | yes | `classification`, `regression`, or `clustering` | + +**Responses:** +- **200**: Array of `MLModel` objects. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X GET "BASE_URL/model/getkind/regression" \ + -H "Accept: application/json" +``` + +**Example Python:** +```python +import requests + +kind = "regression" +resp = requests.get(f"BASE_URL/model/getkind/{kind}") +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: GET /model/getkind/{modelkind} + MLConnector-->>Agent: 200 OK +``` + +--- + +### GET /model/search +**Summary:** Get models by tags. + +**Query Parameters:** + +| Name | In | Type | Required | Description | +|------|-------|------------------|----------|---------------------------------| +| tags | query | array of strings | no | e.g. `?tags=fast&tags=tree-based` | + +**Responses:** +- **200**: Array of `MLModel` objects. +- **422**: `HTTPValidationError`. 
+ +**Example cURL:** +```bash +curl -G "BASE_URL/model/search" \ + --data-urlencode "tags=fast" \ + --data-urlencode "tags=accuracy-focused" \ + -H "Accept: application/json" +``` + +**Example Python:** +```python +import requests + +params = [("tags", "fast"), ("tags", "accuracy-focused")] +resp = requests.get("BASE_URL/model/search", params=params) +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: GET /model/search?tags=tag1&tags=tag2 + MLConnector-->>Agent: 200 OK +``` + +--- + +### PATCH /model/{model_id} +**Summary:** Update metadata of an existing model. + +**Path Parameters:** + +| Name | In | Type | Required | Description | +|----------|------|--------|----------|----------------| +| model_id | path | string | yes | ID of the model | + +> _Note: Request body schema not defined in spec; typically a partial `MLModel` object._ + +**Responses:** +- **200**: (empty response) +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X PATCH "BASE_URL/model/1234" \ + -H "Content-Type: application/json" \ + -d '{ + "modeltags": ["updated-tag"], + "drift_detection": [{"is_true": 1, "method": 1}] + }' +``` + +**Example Python:** +```python +import requests + +update = { + "modeltags": ["updated-tag"], + "drift_detection": [{"is_true": 1, "method": 1}] +} +resp = requests.patch("BASE_URL/model/1234", json=update) +print(resp.status_code) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: PATCH /model/{model_id} + Note right of Agent: Body: partial MLModel JSON + MLConnector-->>Agent: 200 OK +``` + +--- + +### DELETE /model/{model_id} +**Summary:** Delete an existing model. + +**Path Parameters:** + +| Name | In | Type | Required | Description | +|----------|------|--------|----------|----------------| +| model_id | path | string | yes | ID of the model | + +**Responses:** +- **200**: (empty response) +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X DELETE "BASE_URL/model/1234" +``` + +**Example Python:** +```python +import requests + +resp = requests.delete("BASE_URL/model/1234") +print(resp.status_code) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: DELETE /model/{model_id} + MLConnector-->>Agent: 200 OK +``` + +--- + +## Training Endpoints + +### POST /mltraining/add +**Summary:** Initiate model training. + +**Request Body (`MLTrainCreate`):** +```json +{ + "modelid": "1234", + "placement": { + "clusterID": "*", + "node": "*", + "continuum": false + } +} +``` + +**Responses:** +- **201**: `MLTrain` object. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X POST "BASE_URL/mltraining/add" \ + -H "Content-Type: application/json" \ + -d '{ + "modelid": "1234", + "placement": { "clusterID": "*", "node": "*", "continuum": false } + }' +``` + +**Example Python:** +```python +import requests + +payload = { + "modelid": "1234", + "placement": {"clusterID": "*", "node": "*", "continuum": False} +} +resp = requests.post("BASE_URL/mltraining/add", json=payload) +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: POST /mltraining/add + Note right of Agent: Body: MLTrainCreate JSON + MLConnector-->>Agent: 201 Created +``` + +--- + +## Deployment Endpoints + +### GET /deployment/all +**Summary:** Get all deployments. 
+ +**Query Parameters:** + +| Name | In | Type | Default | Required | Description | +|-------|-------|---------|---------|----------|-----------------------------| +| skip | query | integer | 0 | no | Number of items to skip | +| limit | query | integer | 100 | no | Maximum number of items | + +**Responses:** +- **200**: Array of deployment objects. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X GET "BASE_URL/deployment/all?skip=0&limit=50" \ + -H "Accept: application/json" +``` + +**Example Python:** +```python +import requests + +resp = requests.get( + "BASE_URL/deployment/all", + params={"skip": 0, "limit": 50} +) +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: GET /deployment/all?skip={skip}&limit={limit} + MLConnector-->>Agent: 200 OK +``` + +--- + +### POST /deployment/add +**Summary:** Create a new deployment. + +**Request Body (`MLDeploymentCreate`):** +```json +{ + "modelid": "1234", + "ownerid": "agent-1", + "placement": { "clusterID": "*", "node": "*", "continuum": true }, + "deployment_id": "dep-5678", + "inference_data": 1 +} +``` + +**Responses:** +- **201**: `MLDeploymentReturn` object. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X POST "BASE_URL/deployment/add" \ + -H "Content-Type: application/json" \ + -d '{ + "modelid": "1234", + "ownerid": "agent-1", + "placement": { "clusterID": "*", "node": "*", "continuum": true }, + "deployment_id": "dep-5678", + "inference_data": 1 + }' +``` + +**Example Python:** +```python +import requests + +payload = { + "modelid": "1234", + "ownerid": "agent-1", + "placement": {"clusterID": "*", "node": "*", "continuum": True}, + "deployment_id": "dep-5678", + "inference_data": 1 +} +resp = requests.post("BASE_URL/deployment/add", json=payload) +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: POST /deployment/add + Note right of Agent: Body: MLDeploymentCreate JSON + MLConnector-->>Agent: 201 Created +``` + +--- + +### POST /deployment/add/operation +**Summary:** Record an inference operation. + +**Request Body (`MLDeploymentOposCreate`):** +```json +{ + "ownerid": "agent-1", + "deploymentid": "dep-5678", + "modelid": "1234", + "data": "{...}", + "result": "{...}" +} +``` + +**Responses:** +- **201**: `MLDeploymentOposReturn` object. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X POST "BASE_URL/deployment/add/operation" \ + -H "Content-Type: application/json" \ + -d '{ + "ownerid": "agent-1", + "deploymentid": "dep-5678", + "modelid": "1234", + "data": "{...}", + "result": "{...}" + }' +``` + +**Example Python:** +```python +import requests + +payload = { + "ownerid": "agent-1", + "deploymentid": "dep-5678", + "modelid": "1234", + "data": "{...}", + "result": "{...}" +} +resp = requests.post("BASE_URL/deployment/add/operation", json=payload) +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: POST /deployment/add/operation + Note right of Agent: Body: MLDeploymentOposCreate JSON + MLConnector-->>Agent: 201 Created +``` + +--- + +### GET /deployment/get/status/{deployment_id} +**Summary:** Retrieve deployment status. 
+ +**Path Parameters:** + +| Name | In | Type | Required | Description | +|---------------|------|--------|----------|------------------------| +| deployment_id | path | string | yes | ID of the deployment | + +**Responses:** +- **200**: Status object. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X GET "BASE_URL/deployment/get/status/dep-5678" \ + -H "Accept: application/json" +``` + +**Example Python:** +```python +import requests + +resp = requests.get("BASE_URL/deployment/get/status/dep-5678") +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: GET /deployment/get/status/{deployment_id} + MLConnector-->>Agent: 200 OK +``` + +--- + +### GET /deployment/get/opos/{ownerid} +**Summary:** List operations by owner. + +**Path Parameters:** + +| Name | In | Type | Required | Description | +|---------|------|--------|----------|------------------------| +| ownerid | path | string | yes | ID of the operation's owner | + +**Responses:** +- **200**: Array of `MLDeploymentOposReturn` objects. +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X GET "BASE_URL/deployment/get/opos/agent-1" \ + -H "Accept: application/json" +``` + +**Example Python:** +```python +import requests + +resp = requests.get("BASE_URL/deployment/get/opos/agent-1") +print(resp.json()) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: GET /deployment/get/opos/{ownerid} + MLConnector-->>Agent: 200 OK +``` + +--- + +### DELETE /deployment/{deployment_id} +**Summary:** Delete a deployment. + +**Path Parameters:** + +| Name | In | Type | Required | Description | +|---------------|------|--------|----------|------------------------| +| deployment_id | path | string | yes | ID of the deployment | + +**Responses:** +- **200**: (empty response) +- **422**: `HTTPValidationError`. + +**Example cURL:** +```bash +curl -X DELETE "BASE_URL/deployment/dep-5678" +``` + +**Example Python:** +```python +import requests + +resp = requests.delete("BASE_URL/deployment/dep-5678") +print(resp.status_code) +``` + +```mermaid +sequenceDiagram + participant Agent + participant MLConnector + Agent->>MLConnector: DELETE /deployment/{deployment_id} + MLConnector-->>Agent: 200 OK +``` + + +--- + +# End-to-end example + +**Base URL:** `BASE_URL` + +## 1. Build and save model +Below, we build a simple regression model using scikit-learn and save it to local storage. + +```python +... +# Replace with your training pipleline +reg = Ridge(alpha=1.0, random_state=0) +reg.fit(X, y) +... + +# It is important that all models are saved with a .pkl extension +# Serialize with pickle to a .pkl file +output_path = "diabetes_ridge.pkl" +with open(output_path, "wb") as f: + pickle.dump(reg, f) + +``` +## 2. Register ML model with +### 2.1 Model metadata +To register the model above, first we add the model metadata and then the model artfacts. Using the model above, here is json description example (To see what each parameter means see api documentation). 
+```json +{ + "modelname": "Ridge", + "modelkind": "Regressor", + "hyperparameter": [ + { + "parameter": "string", + "value": 0 + } + ], + "modelperformance": [ + { + "metric": "Accuracy", + "order": 1, + "threshold": 0.89 + } + ], + "trainingresource": [ + { + "resource_name": "GPU", + "value": 16, + "deploy": "string" + } + ], + "runresource": [ + { + "resource_name": "GPU", + "value": 16, + "deploy": "string" + } + ], + "featurelist": [...], + "inference": [ + { + "type": "string", + "value": "string" + } + ], +"modeltags": [ + "regression", + "fast" + ], +"drift_detection": [ + { + "is_true": 1, + "method": 0 + } + ] +} +``` +Use the above description, we can then make a post request to register the model. + +```python +import requests +resp = requests.post("BASE_URL/model/add", json=payload) +print(resp.json()) +``` +### 2.2 Model artifacts +The above step should return a model_id that will be used in the next steps. Here, will upload the model artifacts. These include; +- Model file (pickled file saved in step one above) +- Training data. This will be used for explainability and drift detection. (Note, it has to be the exact same data used to train the model, otherwise you will get wrong results) +- Requirements file that defines the environment the model was trained in. + +Upload these one by one using the example bellow; +Note: file_kind can be `model`, `data`, `code`, and `env` +```python +import requests + +files = { + "file": open("model.pkl", "rb"), + "file_kind": (None, "model") +} +resp = requests.post("BASE_URL/model/1234/upload", files=files) +print(resp.json()) +``` +## 3. Deployment +After adding the model artifacts, the next step is to deploy the model. The ML model is deployed as standalone docker application and an endpoint is returned to which inference data can be passed. +```python +import requests + +payload = { + "modelid": "1234", + "ownerid": "agent-1", + "placement": {..}, + "deployment_id": "", + "inference_data": 1 +} +resp = requests.post("BASE_URL/deployment/add", json=payload) +print(resp.json()) +``` +`placement` can one of the following; +- Placement to a specific cluster, node and continuum +```json +{"clusterID": "UTH-Internal-testbed", "node": "mls-drone", "continuum": "Edge"} +``` +- Placement on a given cluster +```json + {"clusterID": "UTH-Internal-testbed", "node": "*", "continuum": "*"} +``` +- Placement anywhere +```json +{"clusterID": "*", "node": "*", "continuum": "*"} +``` +This returns a deployment_id used to query the status of the deployment and also the inference endpoint and explainability. + +### 3.1 Query Deployment Status + +- **List All**: `GET /deployment/all?skip={skip}&limit={limit}` +- **Get Status**: `GET /deployment/get/status/{deployment_id}` + +**Example:** +```bash +curl -X GET "BASE_URL/deployment/get/status/dep-iris-001" +``` +--- + +## 4. Inference Endpoint (including Explainability) + +### 4.1 Predict Call + +Assuming deployment created with `deployment_id = dep-iris-001`: + +```bash +curl -X POST "BASE_URL/deployment/dep-iris-001/predict" \ + -H "Content-Type: application/json" \ + -d '{ + "data": [[5.1, 3.5, 1.4, 0.2]], + "explain": true + }' +``` + +**Response:** +```json +{ + "prediction": [0], + "explanation": { + "feature_importance": [0.12, 0.08, 0.70, 0.10], + "method": "shap" + } +} +``` + +### 4.2 Explainability Details + +- When `explain=true`, response includes per-feature contributions (e.g., SHAP values). +- Interpretation: Positive values push toward the predicted class; negatives push away. 
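+
+For consistency with the Python snippets used in the earlier steps, the same predict call (including the explanation)
+can be issued with `requests`. The base URL, deployment ID, and input row below simply mirror the cURL example above;
+adjust them to your own deployment.
+
+```python
+import requests
+
+payload = {
+    "data": [[5.1, 3.5, 1.4, 0.2]],  # one input row, as in the cURL example
+    "explain": True,                 # ask for per-feature contributions
+}
+
+resp = requests.post("BASE_URL/deployment/dep-iris-001/predict", json=payload)
+resp.raise_for_status()
+
+result = resp.json()
+print(result["prediction"])
+# When explain=true, the response also carries the explanation block.
+print(result.get("explanation", {}).get("feature_importance"))
+```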
+ +--- \ No newline at end of file diff --git a/docs/references/northbound-api.md b/docs/references/northbound-api.md new file mode 100644 index 0000000..dac2954 --- /dev/null +++ b/docs/references/northbound-api.md @@ -0,0 +1,47 @@ +The Northbound API (NB API) serves as the main interface for external systems—such as user interfaces and automation +tools—to interact with the MLSysOps agent-based orchestration framework. Designed as an HTTP-based RESTful service, it +enables users to send commands, retrieve information, and monitor overall system status. +The NB API operates on a predefined IP and port, supporting secure, asynchronous communication with the Continuum Agent. +This design allows for a modular and scalable control layer, effectively abstracting the internal complexity of the +multi-agent system. As a result, it offers a clean, service-oriented interface for seamless integration with external +management tools. + +To ensure clarity and maintainability, the NB API is structured into four main categories, each aligned with a specific +operational domain of the system. This modular organization reflects the core responsibilities and lifecycle stages of +the MLSysOps framework, facilitating consistent and intuitive interaction for all users and systems. + +**Applications**: Manage the lifecycle of deployed applications—from deployment to monitoring and removal. + +| **Method** | **Endpoint** | **Description** | +|------------|-----------------------------------|--------------------------------------------------------------------------------------| +| POST | /apps/deploy | Deploy an application. Requires app description in request body. | +| GET | /apps/list_all/ | Retrieve a list of all deployed applications in the framework. | +| GET | /apps/status/{app_id} | Get the current status of a specific application. | +| GET | /apps/apps/details/{app_id} | Fetch detailed metadata of an application. | +| GET | /apps/performance/{app_id} | Access performance metrics of a deployed application. | +| DELETE | /apps/remove/{app_id} | Remove (undeploy) a specific application. | + +ML Models: Control the deployment and lifecycle of machine learning models integrated into the system. + +| **Method** | **Endpoint** | **Description** | +|------------|-------------------------------|--------------------------------------------------------------| +| POST | /ml/deploy_ml | Deploy a machine learning model to the infrastructure. | +| GET | /ml/list_all/ | List all currently deployed ML models. | +| GET | /ml/status/{model_uid} | Check the status of deployment of an ML model. | +| DELETE | /ml/remove/{model_uid} | Remove an ML model from the system. | + +**Infrastructure**: Register, list, and manage edge, cluster, and datacenter components that make up the continuum. + +| **Method** | **Endpoint** | **Description** | +|------------|-------------------------------------|---------------------------------------------------------------| +| POST | /infra/register | Register infrastructure components (edge node, cluster, etc.). | +| GET | /infra/list/ | List all registered infrastructure components. | +| DELETE | /infra/unregister/{infra_id} | Unregister and remove an infrastructure component. | + +**Management**: System-level controls for health checks and operational mode switching. 
+ +| **Method** | **Endpoint** | **Description** | +|------------|-------------------------------|---------------------------------------------------------------| +| GET | /manage/ping | Check continuum agent status (ping the continuum agent). | +| PUT | /manage/mode/{mode} | Change operational mode of the Agent (Heuristic or ML). |). + diff --git a/docs/references/telemetrysdk.md b/docs/references/telemetrysdk.md new file mode 100755 index 0000000..75f5e03 --- /dev/null +++ b/docs/references/telemetrysdk.md @@ -0,0 +1,15 @@ +# MLSysOps Telemetry SDK & API + +The MLSysOps framework offers integrators two pathways to interface with the telemetry system: i) use the OpenTelemetry +SDK and its respective API or ii) employ the MLSysOps Telemetry SDK, which provides a simplified API. The former +provides SDKs for a wide range of programming languages, enabling both manual instrumentation and automated +instrumentation on compatible software systems. The latter serves as a wrapper on top of the OpenTelemetry SDK, +abstracting away all the mundane code required to connect to an OpenTelemetry Collector and push metrics. This +abstraction uses two function calls: one for pushing metrics and one for retrieving metrics. Instrumenting application +components within the MLSysOps Framework is achievable with either of these options. MLSysOps Telemetry API/SDK is +implemented in Python language and is available in the opensource +repository [MLSysOps Python Telemetry Library](https://github.com/mlsysops-eu/MLSysOps-Python-Telemetry-Library). + +## API Reference + +TBD \ No newline at end of file diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css old mode 100644 new mode 100755 diff --git a/docs/testbed.md b/docs/testbed.md old mode 100644 new mode 100755 diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md old mode 100644 new mode 100755 diff --git a/mkdocs.yml b/mkdocs.yml old mode 100644 new mode 100755 index ceb938d..cdb1a60 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -site_name: MLSysOps framework +site_name: MLSysOps Framework site_url: https://mlsysops-eu.github.io/mlsysops-framework site_author: MLSysOps Consortium @@ -77,6 +77,7 @@ theme: - navigation.tracking - navigation.top - navigation.footer + - navigation.sections - toc.follow - content.code.copy - content.tabs.link