diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index d5cad51c0..c0d08a263 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1899,6 +1899,18 @@ commands: dest: resize type: size default: "0" + - name: backup + help: "create backup task for snapshot" + arguments: + - name: "snapshot_id" + help: "Snapshot id" + dest: snapshot_id + type: str + - name: "--delete-after-finish" + help: "Delete the snapshot after backup is completed" + dest: delete_after_finish + type: bool + action: store_true - name: "qos" help: "qos commands" weight: 700 @@ -1947,3 +1959,48 @@ commands: type: str required: false default: "" + - name: "backup" + help: "FDB Backup operations" + weight: 800 + subcommands: + - name: create + help: "Creates an fdb backup" + arguments: + - name: "cluster_id" + help: "Cluster ID to create db backup for" + dest: cluster_id + type: str + - name: list + help: "Lists all fdb backups" + - name: status + help: "get backup status" + - name: restore + help: "restore a backup" + arguments: + - name: "name" + help: "backup class name" + dest: name + type: str + - name: config + help: "Set backup configuration" + arguments: + - name: "--backup-path" + help: 'local backup path, defaults to /etc/foundationdb/backup' + dest: backup_path + type: str + - name: "--backup-frequency" + help: "backup frequency, can be 3h, 1d" + dest: backup_frequency + type: str + - name: "--s3-bucket" + help: 'AWS S3 bucket name' + dest: bucket_name + type: str + - name: "--s3-region" + help: 'AWS S3 region' + dest: region_name + type: str + - name: "--s3-credentials" + help: 'AWS S3 API key and secret, should be supplied like this: [API_KEY]:[API_SECRET]' + dest: backup_credentials + type: str \ No newline at end of file diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 2bfd792ec..a3e778fc1 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -28,6 +28,7 @@ def __init__(self): self.init_storage_pool() self.init_snapshot() self.init_qos() + self.init_backup() super().__init__() def init_storage_node(self): @@ -748,6 +749,7 @@ def init_snapshot(self): self.init_snapshot__list(subparser) self.init_snapshot__delete(subparser) self.init_snapshot__clone(subparser) + self.init_snapshot__backup(subparser) def init_snapshot__add(self, subparser): @@ -770,6 +772,11 @@ def init_snapshot__clone(self, subparser): subcommand.add_argument('lvol_name', help='Logical volume name', type=str) argument = subcommand.add_argument('--resize', help='New logical volume size: 10M, 10G, 10(bytes). Can only increase.', type=size_type(), default='0', dest='resize') + def init_snapshot__backup(self, subparser): + subcommand = self.add_sub_command(subparser, 'backup', 'create backup task for snapshot') + subcommand.add_argument('snapshot_id', help='Snapshot id', type=str) + argument = subcommand.add_argument('--delete-after-finish', help='Delete the snapshot after backup is completed', dest='delete_after_finish', action='store_true') + def init_qos(self): subparser = self.add_command('qos', 'qos commands') @@ -795,6 +802,38 @@ def init_qos__delete(self, subparser): subcommand.add_argument('cluster_id', help='Cluster UUID', type=str, default='') + def init_backup(self): + subparser = self.add_command('backup', 'FDB Backup operations') + self.init_backup__create(subparser) + self.init_backup__list(subparser) + self.init_backup__status(subparser) + self.init_backup__restore(subparser) + self.init_backup__config(subparser) + + + def init_backup__create(self, subparser): + subcommand = self.add_sub_command(subparser, 'create', 'Creates an fdb backup') + subcommand.add_argument('cluster_id', help='Cluster ID to create db backup for', type=str) + + def init_backup__list(self, subparser): + subcommand = self.add_sub_command(subparser, 'list', 'Lists all fdb backups') + + def init_backup__status(self, subparser): + subcommand = self.add_sub_command(subparser, 'status', 'get backup status') + + def init_backup__restore(self, subparser): + subcommand = self.add_sub_command(subparser, 'restore', 'restore a backup') + subcommand.add_argument('name', help='backup class name', type=str) + + def init_backup__config(self, subparser): + subcommand = self.add_sub_command(subparser, 'config', 'Set backup configuration') + argument = subcommand.add_argument('--backup-path', help='local backup path, defaults to /etc/foundationdb/backup', type=str, dest='backup_path') + argument = subcommand.add_argument('--backup-frequency', help='backup frequency, can be 3h, 1d', type=str, dest='backup_frequency') + argument = subcommand.add_argument('--s3-bucket', help='AWS S3 bucket name', type=str, dest='bucket_name') + argument = subcommand.add_argument('--s3-region', help='AWS S3 region', type=str, dest='region_name') + argument = subcommand.add_argument('--s3-credentials', help='AWS S3 API key and secret, should be supplied like this: [API_KEY]:[API_SECRET]', type=str, dest='backup_credentials') + + def run(self): args = self.parser.parse_args() if args.debug: @@ -1113,6 +1152,8 @@ def run(self): ret = self.snapshot__delete(sub_command, args) elif sub_command in ['clone']: ret = self.snapshot__clone(sub_command, args) + elif sub_command in ['backup']: + ret = self.snapshot__backup(sub_command, args) else: self.parser.print_help() @@ -1127,6 +1168,21 @@ def run(self): else: self.parser.print_help() + elif args.command in ['backup']: + sub_command = args_dict['backup'] + if sub_command in ['create']: + ret = self.backup__create(sub_command, args) + elif sub_command in ['list']: + ret = self.backup__list(sub_command, args) + elif sub_command in ['status']: + ret = self.backup__status(sub_command, args) + elif sub_command in ['restore']: + ret = self.backup__restore(sub_command, args) + elif sub_command in ['config']: + ret = self.backup__config(sub_command, args) + else: + self.parser.print_help() + else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 277984c6a..ed9168f16 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -12,7 +12,7 @@ from simplyblock_core import storage_node_ops as storage_ops from simplyblock_core import mgmt_node_ops as mgmt_ops from simplyblock_core.controllers import pool_controller, lvol_controller, snapshot_controller, device_controller, \ - tasks_controller, qos_controller + tasks_controller, qos_controller, backup_controller from simplyblock_core.controllers import health_controller from simplyblock_core.models.pool import Pool from simplyblock_core.models.cluster import Cluster @@ -657,6 +657,9 @@ def snapshot__clone(self, sub_command, args): success, details = snapshot_controller.clone(args.snapshot_id, args.lvol_name, new_size) return details + def snapshot__backup(self, sub_command, args): + return snapshot_controller.create_backup(args.snapshot_id, args.delete_after_finish) + def qos__add(self, sub_command, args): return qos_controller.add_class(args.name, args.weight, args.cluster_id) @@ -666,6 +669,21 @@ def qos__list(self, sub_command, args): def qos__delete(self, sub_command, args): return qos_controller.delete_class(args.name, args.cluster_id) + def backup__create(self, sub_command, args): + return tasks_controller.add_backup_task(args.cluster_id) + + def backup__list(self, sub_command, args): + return backup_controller.list_backups() + + def backup__status(self, sub_command, args): + return backup_controller.backup_status() + + def backup__restore(self, sub_command, args): + return backup_controller.backup_restore(args.name) + + def backup__config(self, sub_command, args): + return backup_controller.backup_configure(args.backup_path, args.backup_frequency, args.bucket_name, args.region_name, args.backup_credentials) + def storage_node_list_devices(self, args): node_id = args.node_id is_json = args.json diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index ceff1374d..bf1c3ca00 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -1193,6 +1193,13 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, service_file="python simplyblock_core/services/tasks_runner_jc_comp.py", service_image=service_image) + if "app_BackupService" not in service_names: + utils.create_docker_service( + cluster_docker=cluster_docker, + service_name="app_BackupService", + service_file="python simplyblock_core/services/tasks_runner_fdb_backup.py", + service_image=service_image) + logger.info("Done updating mgmt cluster") elif cluster.mode == "kubernetes": @@ -1239,6 +1246,14 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, service_file="simplyblock_core/services/snapshot_monitor.py", container_image=service_image) + if "simplyblock-backup-service" not in deployment_names: + utils.create_k8s_service( + namespace=namespace, + deployment_name="simplyblock-backup-service", + container_name="snapshot-monitor", + service_file="simplyblock_core/services/tasks_runner_fdb_backup.py", + container_image=service_image) + # Update DaemonSets daemonsets = apps_v1.list_namespaced_daemon_set(namespace=namespace) for ds in daemonsets.items: diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index b9921d6ac..b8c05346d 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -26,6 +26,7 @@ def get_config_var(name, default=None): KVD_DB_VERSION = 730 KVD_DB_FILE_PATH = os.getenv('FDB_CLUSTER_FILE', '/etc/foundationdb/fdb.cluster') KVD_DB_TIMEOUT_MS = 10000 +KVD_DB_BACKUP_PATH = "file:///etc/foundationdb/backup" SPK_DIR = '/home/ec2-user/spdk' LOG_LEVEL = logging.INFO LOG_WEB_LEVEL = logging.DEBUG diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py new file mode 100644 index 000000000..470d39511 --- /dev/null +++ b/simplyblock_core/controllers/backup_controller.py @@ -0,0 +1,177 @@ +# coding=utf-8 +import datetime +import logging as lg +import re +import time +import uuid + +import docker + +from simplyblock_core import utils, constants +from simplyblock_core.db_controller import DBController +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.snapshot import SnapShot + +logger = lg.getLogger() +db_controller = DBController() +# backup_path = constants.KVD_DB_BACKUP_PATH +clusters = db_controller.get_clusters() +if clusters: + cluster = clusters[0] + + +def __get_fdb_cont(): + snode = db_controller.get_mgmt_nodes()[0] + if not snode: + return + node_docker = docker.DockerClient(base_url=f"tcp://{snode.docker_ip_port}", version="auto") + for container in node_docker.containers.list(): + if container.name.startswith("app_fdb-server"): # type: ignore[union-attr] + return container + +def create_backup(): + container = __get_fdb_cont() + if container: + backup_path = cluster.backup_local_path + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + folder = f"backup-{str(datetime.datetime.now())}" + folder = folder.replace(" ", "-") + folder = folder.replace(":", "-") + folder = folder.split(".")[0] + backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}®ion={cluster.backup_s3_region}&sc=0" + + res = container.exec_run(cmd=f"fdbbackup start -d {backup_path} -w") + cont = res.output.decode("utf-8") + logger.info(cont) + return True + return False + +def list_backups(): + container = __get_fdb_cont() + data = [] + if container: + backup_path = cluster.get_backup_path() + res = container.exec_run(cmd=f"fdbbackup list -b {backup_path}") + logger.info(f"backup list from : {backup_path}") + cont = res.output.decode("utf-8") + for line in cont.splitlines(): + if not line or "backup-" not in line: + continue + + name = line.split("/")[-1].strip() + name = name.split("?")[0] + size = 0 + restorable = 0 + date = "" + version = 0 + res = container.exec_run(cmd=f"fdbbackup describe -d {cluster.get_backup_path(name)} --version-timestamps") + cont = res.output.decode("utf-8") + for line in cont.splitlines(): + if line and line.startswith("SnapshotBytes"): + size = line.split()[1].strip() + if line and line.startswith("Restorable"): + restorable = line.split()[1].strip() + if line and line.startswith("Snapshot:"): + for param in line.split(): + if param.startswith("startVersion"): + version = param.split("=")[1].strip() + elif param.startswith("(") and param.endswith(")") and not date:# 2025/12/28.10:10:20+0000 + try: + date = datetime.datetime.strptime(param[1:-1], "%Y/%m/%d.%H:%M:%S+0000").strftime("%Y-%m-%d %H:%M:%S") + except Exception: + date = name.replace("backup-","") + if not date: + date = name.replace("backup-", "") + + data.append({ + "Name": name, + "Version": version, + "Size": utils.humanbytes(int(size)), + "Restorable": restorable, + "Date": date, + }) + + return utils.print_table(data) + + return True + + + +def backup_status(): + container = __get_fdb_cont() + if container: + res = container.exec_run(cmd="fdbbackup status") + cont = res.output.decode("utf-8") + logger.info(f"backup status: \n{cont.strip()}") + return True + + +def backup_restore(backup_name): + container = __get_fdb_cont() + if container: + backup_path = cluster.get_backup_path(backup_name) + res = container.exec_run(cmd="fdbcli --exec \"writemode on; clearrange \\\"\\\" \\xff\"") + cont = res.output.decode("utf-8") + logger.info(cont.strip()) + res = container.exec_run(cmd=f"fdbrestore start -r \"{backup_path}\" --dest-cluster-file {constants.KVD_DB_FILE_PATH}") + cont = res.output.decode("utf-8") + logger.info(cont.strip()) + return True + + +def parse_history_param(history_string): + if not history_string: + logger.error("Invalid history value") + return False + results = re.search(r'^(\d+[hmd])(\d+[hmd])?$', history_string.lower()) + if not results: + logger.error(f"Error parsing history string: {history_string}") + return False + total_seconds = 0 + for s in results.groups(): + if not s: + continue + ind = s[-1] + v = int(s[:-1]) + if ind == 'd': + total_seconds += v*60*60*24 + if ind == 'h': + total_seconds += v*60*60 + if ind == 'm': + total_seconds += v*60 + return int(total_seconds) + + +def backup_configure(backup_path, backup_frequency, bucket_name, region_name, backup_credentials): + clusters = db_controller.get_clusters() + if clusters: + if backup_path: + if not backup_path.startswith("file://"): + backup_path = f"file://{backup_path}" + clusters[0].backup_local_path = backup_path + if backup_frequency: + total_seconds = parse_history_param(backup_frequency) + clusters[0].backup_frequency_seconds = total_seconds + clusters[0].backup_s3_region = region_name if region_name else "" + clusters[0].backup_s3_bucket = bucket_name if bucket_name else "" + clusters[0].backup_s3_cred = backup_credentials if backup_credentials else "" + clusters[0].write_to_db() + return True + + +def create_snapshot_backup(snapshot: SnapShot): + container = __get_fdb_cont() + if container: + backup_path = cluster.backup_local_path + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + folder = f"backup-{str(datetime.datetime.now())}" + folder = folder.replace(" ", "-") + folder = folder.replace(":", "-") + folder = folder.split(".")[0] + backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}®ion={cluster.backup_s3_region}&sc=0" + + res = container.exec_run(cmd=f"fdbbackup start -d {backup_path} -w") + cont = res.output.decode("utf-8") + logger.info(cont) + return True + return False diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index d3eca0e00..24576f8ed 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -3,7 +3,7 @@ import time import uuid -from simplyblock_core.controllers import lvol_controller, snapshot_events, pool_controller +from simplyblock_core.controllers import lvol_controller, snapshot_events, pool_controller, tasks_controller from simplyblock_core import utils, constants from simplyblock_core.db_controller import DBController @@ -219,6 +219,10 @@ def add(lvol_id, snapshot_name): logger.info("Done") snapshot_events.snapshot_create(snap) + + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + tasks_controller.add_snap_backup_task(snap) + return snap.uuid, False @@ -587,3 +591,20 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None if new_size: lvol_controller.resize_lvol(lvol.get_id(), new_size) return lvol.uuid, False + + +def create_backup(snapshot_uuid, delete_after_finish=False): + try: + snap = db_controller.get_snapshot_by_id(snapshot_uuid) + except KeyError: + logger.error(f"Snapshot not found {snapshot_uuid}") + return False + + cluster = db_controller.get_cluster_by_id(snap.cluster_id) + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + tasks_controller.add_snap_backup_task(snap, delete_after_finish) + logger.info("Done") + return True + else: + logger.error("Cluster S3 backup is not configured") + return False diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index dab539943..11f568c20 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -9,6 +9,7 @@ from simplyblock_core.controllers import tasks_events, device_controller from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode logger = logging.getLogger() @@ -160,8 +161,7 @@ def list_tasks(cluster_id, is_json=False, limit=50, **kwargs): return False data = [] - tasks = db.get_job_tasks(cluster_id, reverse=True) - tasks.reverse() + tasks = db.get_job_tasks(cluster_id, reverse=False) if is_json is True: for t in tasks: if t.function_name == JobSchedule.FN_DEV_MIG: @@ -420,3 +420,72 @@ def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None): return task.uuid return False + +def add_backup_task(cluster_id): + try: + cluster = db.get_cluster_by_id(cluster_id) + except Exception as e: + logger.error(f"Failed to get cluster {cluster_id}: {e}") + return False + + tasks = get_backup_tasks(cluster_id) + for task in tasks: + if task.status != JobSchedule.STATUS_DONE: + logger.info(f"Backup task found: {tasks[0].uuid}") + return False + + task_obj = JobSchedule() + task_obj.uuid = str(uuid.uuid4()) + task_obj.cluster_id = cluster.get_id() + task_obj.date = int(time.time()) + task_obj.max_retry = constants.TASK_EXEC_RETRY_COUNT + task_obj.status = JobSchedule.STATUS_NEW + task_obj.function_name = JobSchedule.FN_FDB_BACKUP + task_obj.write_to_db() + logger.info(f"Backup task created: {task_obj.uuid}") + return task_obj.uuid + + +def get_backup_tasks(cluster_id): + backup_tasks = [] + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_FDB_BACKUP: + backup_tasks.append(task) + return backup_tasks + + +def add_snap_backup_task(snapshot: SnapShot, delete_after_finish: bool = False): + try: + cluster = db.get_cluster_by_id(snapshot.cluster_id) + except Exception as e: + logger.error(f"Failed to get cluster {snapshot.cluster_id}: {e}") + return False + + tasks = get_snap_backup_task(snapshot.cluster_id) + for task in tasks: + if task.status != JobSchedule.STATUS_DONE and task.function_params.get("snapshot_id") == snapshot.get_id(): + logger.info(f"Snapshot Backup task found: {task.uuid}") + return False + + task_obj = JobSchedule() + task_obj.uuid = str(uuid.uuid4()) + task_obj.cluster_id = cluster.get_id() + task_obj.date = int(time.time()) + task_obj.max_retry = constants.TASK_EXEC_RETRY_COUNT + task_obj.status = JobSchedule.STATUS_NEW + task_obj.function_name = JobSchedule.FN_SNAPSHOT_BACKUP + task_obj.function_params ={"snapshot_id": snapshot.get_id(), "delete_after_finish": delete_after_finish} + task_obj.write_to_db() + + logger.info(f"Backup task created: {task_obj.uuid}") + return task_obj.uuid + + +def get_snap_backup_task(cluster_id): + backup_tasks = [] + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_SNAPSHOT_BACKUP: + backup_tasks.append(task) + return backup_tasks diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index 027f0d179..ac68a1969 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,5 +1,5 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev SIMPLY_BLOCK_VERSION=19.2.30 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:R25.10-Hotfix -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:R25.10-Hotfix-latest +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:R25.10-Hotfix-fdb-backup +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=simplyblock/spdk:s3-backup-latest diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index 620309f77..02277feb0 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -1,7 +1,8 @@ # coding=utf-8 - +import os.path from typing import List +from simplyblock_core import constants from simplyblock_core.models.base_model import BaseModel @@ -69,6 +70,11 @@ class Cluster(BaseModel): is_re_balancing: bool = False full_page_unmap: bool = True is_single_node: bool = False + backup_local_path: str = constants.KVD_DB_BACKUP_PATH + backup_frequency_seconds: int = 3*60*60 + backup_s3_bucket: str = "" + backup_s3_region: str = "" + backup_s3_cred: str = "" def get_status_code(self): if self.status in self.STATUS_CODE_MAP: @@ -90,3 +96,10 @@ def is_qos_set(self) -> bool: return True return False + def get_backup_path(self, path=""): + if self.backup_s3_bucket and self.backup_s3_cred: + backup_path = f"blobstore://{self.backup_s3_cred}@s3.{self.backup_s3_region}.amazonaws.com/{path}?bucket={self.backup_s3_bucket}" \ + + f"®ion={self.backup_s3_region}&sc=0" + else: + backup_path = os.path.join(self.backup_local_path, path) + return backup_path diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py index bbdcd7871..575ad705a 100644 --- a/simplyblock_core/models/job_schedule.py +++ b/simplyblock_core/models/job_schedule.py @@ -23,6 +23,8 @@ class JobSchedule(BaseModel): FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add" FN_JC_COMP_RESUME = "jc_comp_resume" FN_LVOL_SYNC_DEL = "lvol_sync_del" + FN_FDB_BACKUP = "fdb_backup" + FN_SNAPSHOT_BACKUP = "snapshot_backup" canceled: bool = False cluster_id: str = "" diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 293a76b8d..a13871bcb 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -1234,3 +1234,11 @@ def nvmf_port_unblock_rdma(self, port): def nvmf_get_blocked_ports_rdma(self): return self._request("nvmf_get_blocked_ports") + + def bdev_lvol_s3_backup(self, s3_id, snapshots): + params = { + "s3_id": s3_id, + "cluster_batch": 16, + "snapshots": snapshots + } + return self._request2("bdev_lvol_s3_backup", params) diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index ab11562d9..458afa322 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -1158,6 +1158,57 @@ spec: - key: cluster-file path: fdb.cluster --- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simplyblock-backup-service + namespace: {{ .Release.Namespace }} +spec: + replicas: 1 + selector: + matchLabels: + app: simplyblock-backup-service + template: + metadata: + annotations: + log-collector/enabled: "true" + reloader.stakater.com/auto: "true" + reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" + labels: + app: simplyblock-backup-service + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: backup-service + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" + command: ["python", "simplyblock_core/services/tasks_runner_fdb_backup.py"] + env: + - name: SIMPLYBLOCK_LOG_LEVEL + valueFrom: + configMapKeyRef: + name: simplyblock-config + key: LOG_LEVEL + volumeMounts: + - name: fdb-cluster-file + mountPath: /etc/foundationdb/fdb.cluster + subPath: fdb.cluster + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "400m" + memory: "1Gi" + volumes: + - name: fdb-cluster-file + configMap: + name: simplyblock-fdb-cluster-config + items: + - key: cluster-file + path: fdb.cluster +--- apiVersion: apps/v1 kind: DaemonSet diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml index eb29d68b6..f6b5e579a 100644 --- a/simplyblock_core/scripts/docker-compose-swarm.yml +++ b/simplyblock_core/scripts/docker-compose-swarm.yml @@ -364,6 +364,20 @@ services: environment: SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + BackupService: + <<: *service-base + image: $SIMPLYBLOCK_DOCKER_IMAGE + command: "python simplyblock_core/services/tasks_runner_fdb_backup.py" + deploy: + placement: + constraints: [node.role == manager] + volumes: + - "/etc/foundationdb:/etc/foundationdb" + networks: + - hostnet + environment: + SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + networks: monitoring-net: external: true diff --git a/simplyblock_core/services/cap_monitor.py b/simplyblock_core/services/cap_monitor.py index c68049256..61a4beea9 100644 --- a/simplyblock_core/services/cap_monitor.py +++ b/simplyblock_core/services/cap_monitor.py @@ -4,9 +4,9 @@ from datetime import datetime, timezone from simplyblock_core import db_controller, constants, cluster_ops, utils -from simplyblock_core.controllers import cluster_events +from simplyblock_core.controllers import cluster_events, tasks_controller from simplyblock_core.models.cluster import Cluster - +from simplyblock_core.models.job_schedule import JobSchedule logger = utils.get_logger(__name__) @@ -14,10 +14,22 @@ db = db_controller.DBController() last_event: dict[str, dict] = {} + +def create_fdb_backup_if_needed(cluster): + last_backup_task = None + tasks = tasks_controller.get_backup_tasks(cluster.get_id()) + if tasks: + last_backup_task = tasks[0] + if last_backup_task and last_backup_task.status == JobSchedule.STATUS_DONE: + if last_backup_task.date + cluster.backup_frequency_seconds < time.time(): + tasks_controller.add_backup_task(cluster.get_id()) + + logger.info("Starting capacity monitoring service...") while True: clusters = db.get_clusters() for cl in clusters: + create_fdb_backup_if_needed(cl) logger.info(f"Checking cluster: {cl.get_id()}") records = db.get_cluster_capacity(cl, 1) if not records: diff --git a/simplyblock_core/services/tasks_runner_fdb_backup.py b/simplyblock_core/services/tasks_runner_fdb_backup.py new file mode 100644 index 000000000..6806682e3 --- /dev/null +++ b/simplyblock_core/services/tasks_runner_fdb_backup.py @@ -0,0 +1,114 @@ +# coding=utf-8 +import time + + +from simplyblock_core import db_controller, utils, constants +from simplyblock_core.controllers import backup_controller, tasks_controller +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.storage_node import StorageNode + +logger = utils.get_logger(__name__) + +# get DB controller +db = db_controller.DBController() + + +def process_fdb_backup_task(task): + task = db.get_task_by_id(task.uuid) + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + if task.retry >= task.max_retry: + task.function_result = "max retry reached, stopping task" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + ret = backup_controller.create_backup() + if ret: + task.function_result = "Backup created" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + +def process_snap_backup_task(task): + + task = db.get_task_by_id(task.uuid) + + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + node = db.get_storage_node_by_id(task.node_id) + + if not node: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: + msg = f"Node is {node.status}, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + cluster = db.get_cluster_by_id(task.cluster_id) + snapshot_id = task.function_params["snapshot_id"] + snapshot = db.get_snapshot_by_id(snapshot_id) + logger.info(f"backing up: {snapshot_id}") + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + folder = f"backup-{snapshot_id}" + folder = folder.replace(" ", "-") + folder = folder.replace(":", "-") + folder = folder.split(".")[0] + backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}®ion={cluster.backup_s3_region}&sc=0" + ret, err = node.rpc_client().bdev_lvol_s3_backup(backup_path, snapshot.snap_bdev) + if not ret: + logger.error( + f"Failed to backup snapshot: {snapshot_id}, error: {err}") + + task.function_result = f"snap backup error" + else: + task.function_result = f"snap backup done" + + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + +logger.info("Starting Tasks runner fdb backup...") + +while True: + clusters = db.get_clusters() + if not clusters: + logger.error("No clusters found!") + else: + for cl in clusters: + if cl.status == Cluster.STATUS_IN_ACTIVATION: + continue + + tasks = db.get_job_tasks(cl.get_id()) + for task in tasks: + if task.status != JobSchedule.STATUS_DONE: + if task.function_name == JobSchedule.FN_FDB_BACKUP: + process_fdb_backup_task(task) + elif task.function_name == JobSchedule.FN_SNAPSHOT_BACKUP: + process_snap_backup_task(task) + + time.sleep(constants.TASK_EXEC_INTERVAL_SEC) diff --git a/simplyblock_core/test/test_utils.py b/simplyblock_core/test/test_utils.py index 514ba0bbe..5cdb66b1a 100644 --- a/simplyblock_core/test/test_utils.py +++ b/simplyblock_core/test/test_utils.py @@ -1,9 +1,11 @@ import uuid from typing import ContextManager +from unittest.mock import patch import pytest from simplyblock_core import utils, storage_node_ops +from simplyblock_core.db_controller import DBController from simplyblock_core.models.nvme_device import JMDevice, RemoteJMDevice from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.utils import helpers, parse_thread_siblings_list @@ -153,24 +155,26 @@ def test_parse_thread_siblings_list(input, expected): -def test_get_node_jm_names(): +@patch.object(DBController, 'get_jm_device_by_id') +def test_get_node_jm_names(db_controller_get_jm_device_by_id): + jm_devices = [] node_1_jm = JMDevice() node_1_jm.uuid = "node_1_jm_id" node_1_jm.jm_bdev = "node_1_jm" - + jm_devices.append(node_1_jm) node_2_jm = JMDevice() node_2_jm.uuid = "node_2_jm_id" node_2_jm.jm_bdev = "node_2_jm" - + jm_devices.append(node_2_jm) node_3_jm = JMDevice() node_3_jm.uuid = "node_3_jm_id" node_3_jm.jm_bdev = "node_3_jm" - + jm_devices.append(node_3_jm) node_4_jm = JMDevice() node_4_jm.uuid = "node_4_jm_id" node_4_jm.jm_bdev = "node_4_jm" - + jm_devices.append(node_4_jm) node_1 = StorageNode() node_1.uuid = str(uuid.uuid4()) node_1.enable_ha_jm = True @@ -178,6 +182,13 @@ def test_get_node_jm_names(): node_1.jm_device = node_1_jm node_1.jm_ids = ["node_2_jm_id", "node_3_jm_id", "node_4_jm_id"] + def get_jm_device_by_id(jm_id): + for dev in jm_devices: + if dev.uuid == jm_id: + return dev + + db_controller_get_jm_device_by_id.side_effect = get_jm_device_by_id + remote_node = StorageNode() remote_node.uuid = str(uuid.uuid4()) remote_node.enable_ha_jm = True diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index af74a76b8..0a384cedb 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -4,7 +4,6 @@ import math import os from pathlib import Path -import subprocess import time from typing import List, Optional, Union