From fb61d1628383cef6bd33bc62daccec1ddfbae940 Mon Sep 17 00:00:00 2001 From: janmatzek Date: Tue, 7 Oct 2025 13:46:55 +0200 Subject: [PATCH 1/2] feat(gooddata-pipelines): add workspace restore --- gooddata-pipelines/README.md | 7 +- .../gooddata_pipelines/__init__.py | 6 + .../gooddata_pipelines/api/gooddata_api.py | 27 ++ .../backup_and_restore/backup_manager.py | 67 +--- .../backup_and_restore/base_manager.py | 73 ++++ .../backup_and_restore/constants.py | 9 +- .../backup_and_restore/models/storage.py | 96 ++++- .../backup_and_restore/restore_manager.py | 266 ++++++++++++++ .../storage/base_storage.py | 11 + .../storage/local_storage.py | 27 +- .../backup_and_restore/storage/s3_storage.py | 27 +- .../provisioning/utils/utils.py | 11 +- .../gooddata_pipelines/utils/decorators.py | 30 ++ .../gooddata_pipelines/utils/file_utils.py | 63 ++++ .../tests/backup_and_restore/test_backup.py | 7 +- .../tests/backup_and_restore/test_restore.py | 346 ++++++++++++++++++ .../tests/data/restore/test_conf.yaml | 6 + .../data/restore/test_udf_root/filter1.yaml | 7 + .../data/restore/test_udf_root/filter2.yaml | 7 + .../user_data_filters/filter1.yaml | 7 + .../user_data_filters/filter2.yaml | 7 + .../tests/utils/test_decorators.py | 40 ++ 22 files changed, 1044 insertions(+), 103 deletions(-) create mode 100644 gooddata-pipelines/gooddata_pipelines/backup_and_restore/base_manager.py create mode 100644 gooddata-pipelines/gooddata_pipelines/backup_and_restore/restore_manager.py create mode 100644 gooddata-pipelines/gooddata_pipelines/utils/decorators.py create mode 100644 gooddata-pipelines/gooddata_pipelines/utils/file_utils.py create mode 100644 gooddata-pipelines/tests/backup_and_restore/test_restore.py create mode 100644 gooddata-pipelines/tests/data/restore/test_conf.yaml create mode 100644 gooddata-pipelines/tests/data/restore/test_udf_root/filter1.yaml create mode 100644 gooddata-pipelines/tests/data/restore/test_udf_root/filter2.yaml create mode 100644 gooddata-pipelines/tests/data/restore/test_udf_root/user_data_filters/filter1.yaml create mode 100644 gooddata-pipelines/tests/data/restore/test_udf_root/user_data_filters/filter2.yaml create mode 100644 gooddata-pipelines/tests/utils/test_decorators.py diff --git a/gooddata-pipelines/README.md b/gooddata-pipelines/README.md index 1079a9c77..8d57ee55c 100644 --- a/gooddata-pipelines/README.md +++ b/gooddata-pipelines/README.md @@ -10,9 +10,10 @@ You can use the package to manage following resources in GDC: - User/Group permissions - User Data Filters - Child workspaces (incl. Workspace Data Filter settings) -1. _[PLANNED]:_ Backup and restore of workspaces -1. _[PLANNED]:_ Custom fields management - - extend the Logical Data Model of a child workspace +1. Backup and restore of workspaces + - Create and backup snapshots of workspace metadata. +1. LDM Extension + - extend the Logical Data Model of a child workspace with custom datasets and fields In case you are not interested in incorporating a library in your own program but would like to use a ready-made script, consider having a look at [GoodData Productivity Tools](https://github.com/gooddata/gooddata-productivity-tools). 
diff --git a/gooddata-pipelines/gooddata_pipelines/__init__.py b/gooddata-pipelines/gooddata_pipelines/__init__.py index 37ae75da0..70e96cf55 100644 --- a/gooddata-pipelines/gooddata_pipelines/__init__.py +++ b/gooddata-pipelines/gooddata_pipelines/__init__.py @@ -10,6 +10,10 @@ S3StorageConfig, StorageType, ) +from .backup_and_restore.restore_manager import ( + RestoreManager, + WorkspaceToRestore, +) from .backup_and_restore.storage.local_storage import LocalStorage from .backup_and_restore.storage.s3_storage import S3Storage @@ -57,6 +61,8 @@ __all__ = [ "BackupManager", + "RestoreManager", + "WorkspaceToRestore", "BackupRestoreConfig", "StorageType", "LocalStorage", diff --git a/gooddata-pipelines/gooddata_pipelines/api/gooddata_api.py b/gooddata-pipelines/gooddata_pipelines/api/gooddata_api.py index 59c09490d..daeba2336 100644 --- a/gooddata-pipelines/gooddata_pipelines/api/gooddata_api.py +++ b/gooddata-pipelines/gooddata_pipelines/api/gooddata_api.py @@ -167,6 +167,17 @@ def get_user_data_filters(self, workspace_id: str) -> requests.Response: endpoint = f"/layout/workspaces/{workspace_id}/userDataFilters" return self._get(endpoint) + def put_user_data_filters( + self, workspace_id: str, user_data_filters: dict[str, Any] + ) -> requests.Response: + """Puts the user data filters into GoodData workspace.""" + headers = {**self.headers, "Content-Type": "application/json"} + return self._put( + f"/layout/workspaces/{workspace_id}/userDataFilters", + user_data_filters, + headers, + ) + def get_automations(self, workspace_id: str) -> requests.Response: """Gets the automations for a given workspace.""" endpoint = ( @@ -174,6 +185,22 @@ def get_automations(self, workspace_id: str) -> requests.Response: ) return self._get(endpoint) + def post_automation( + self, workspace_id: str, automation: dict[str, Any] + ) -> requests.Response: + """Posts an automation for a given workspace.""" + endpoint = f"/entities/workspaces/{workspace_id}/automations" + return self._post(endpoint, automation) + + def delete_automation( + self, workspace_id: str, automation_id: str + ) -> requests.Response: + """Deletes an automation for a given workspace.""" + endpoint = ( + f"/entities/workspaces/{workspace_id}/automations/{automation_id}" + ) + return self._delete(endpoint) + def get_all_metrics(self, workspace_id: str) -> requests.Response: """Get all metrics from the specified workspace. 
diff --git a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/backup_manager.py b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/backup_manager.py index 8d6ea4007..616126bca 100644 --- a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/backup_manager.py +++ b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/backup_manager.py @@ -1,23 +1,20 @@ # (C) 2025 GoodData Corporation -import json import os import shutil import tempfile import time import traceback from pathlib import Path -from typing import Any, Type +from typing import Any import attrs import requests -import yaml -from gooddata_sdk.utils import PROFILES_FILE_PATH, profile_content -from gooddata_pipelines.api.gooddata_api_wrapper import GoodDataApi from gooddata_pipelines.backup_and_restore.backup_input_processor import ( BackupInputProcessor, ) +from gooddata_pipelines.backup_and_restore.base_manager import BaseManager from gooddata_pipelines.backup_and_restore.constants import ( BackupSettings, DirNames, @@ -25,18 +22,10 @@ from gooddata_pipelines.backup_and_restore.models.input_type import InputType from gooddata_pipelines.backup_and_restore.models.storage import ( BackupRestoreConfig, - StorageType, ) from gooddata_pipelines.backup_and_restore.storage.base_storage import ( BackupStorage, ) -from gooddata_pipelines.backup_and_restore.storage.local_storage import ( - LocalStorage, -) -from gooddata_pipelines.backup_and_restore.storage.s3_storage import ( - S3Storage, -) -from gooddata_pipelines.logger import LogObserver from gooddata_pipelines.utils.rate_limiter import RateLimiter @@ -45,16 +34,12 @@ class BackupBatch: list_of_ids: list[str] -class BackupManager: +class BackupManager(BaseManager): storage: BackupStorage def __init__(self, host: str, token: str, config: BackupRestoreConfig): - self._api = GoodDataApi(host, token) - self.logger = LogObserver() - - self.config = config + super().__init__(host, token, config) - self.storage = self._get_storage(self.config) self.org_id = self._api.get_organization_id() self.loader = BackupInputProcessor(self._api, self.config.api_page_size) @@ -63,39 +48,6 @@ def __init__(self, host: str, token: str, config: BackupRestoreConfig): calls_per_second=self.config.api_calls_per_second, ) - @classmethod - def create( - cls: Type["BackupManager"], - config: BackupRestoreConfig, - host: str, - token: str, - ) -> "BackupManager": - """Creates a backup worker instance using the provided host and token.""" - return cls(host=host, token=token, config=config) - - @classmethod - def create_from_profile( - cls: Type["BackupManager"], - config: BackupRestoreConfig, - profile: str = "default", - profiles_path: Path = PROFILES_FILE_PATH, - ) -> "BackupManager": - """Creates a backup worker instance using a GoodData profile file.""" - content = profile_content(profile, profiles_path) - return cls(**content, config=config) - - @staticmethod - def _get_storage(conf: BackupRestoreConfig) -> BackupStorage: - """Returns the storage class based on the storage type.""" - if conf.storage_type == StorageType.S3: - return S3Storage(conf) - elif conf.storage_type == StorageType.LOCAL: - return LocalStorage(conf) - else: - raise RuntimeError( - f'Unsupported storage type "{conf.storage_type.value}".' 
- ) - def get_user_data_filters(self, ws_id: str) -> dict: """Returns the user data filters for the specified workspace.""" with self._api_rate_limiter: @@ -133,19 +85,13 @@ def _store_user_data_filters( "user_data_filters", filter["id"] + ".yaml", ) - self._write_to_yaml(udf_file_path, filter) + self.yaml_utils.dump(udf_file_path, filter) @staticmethod def _move_folder(source: Path, destination: Path) -> None: """Moves the source folder to the destination.""" shutil.move(source, destination) - @staticmethod - def _write_to_yaml(path: str, source: Any) -> None: - """Writes the source to a YAML file.""" - with open(path, "w") as outfile: - yaml.dump(source, outfile) - def _get_automations_from_api(self, workspace_id: str) -> Any: """Returns automations for the workspace as JSON.""" with self._api_rate_limiter: @@ -182,8 +128,7 @@ def _store_automations(self, export_path: Path, workspace_id: str) -> None: # Store the automations in a JSON file if len(automations["data"]) > 0: - with open(automations_file_path, "w") as f: - json.dump(automations, f) + self.json_utils.dump(automations_file_path, automations) def store_declarative_filter_views( self, export_path: Path, workspace_id: str diff --git a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/base_manager.py b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/base_manager.py new file mode 100644 index 000000000..1de748250 --- /dev/null +++ b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/base_manager.py @@ -0,0 +1,73 @@ +# (C) 2025 GoodData Corporation + +import abc +from pathlib import Path +from typing import Type, TypeVar + +from gooddata_sdk.utils import PROFILES_FILE_PATH, profile_content + +from gooddata_pipelines.api.gooddata_api_wrapper import GoodDataApi +from gooddata_pipelines.backup_and_restore.models.storage import ( + BackupRestoreConfig, + StorageType, +) +from gooddata_pipelines.backup_and_restore.storage.base_storage import ( + BackupStorage, +) +from gooddata_pipelines.backup_and_restore.storage.local_storage import ( + LocalStorage, +) +from gooddata_pipelines.backup_and_restore.storage.s3_storage import S3Storage +from gooddata_pipelines.logger import LogObserver +from gooddata_pipelines.utils.file_utils import JsonUtils, YamlUtils + +ManagerT = TypeVar("ManagerT", bound="BaseManager") + + +class BaseManager(abc.ABC): + """Base class to provide constructors for backup and restore managers.""" + + storage: BackupStorage + + def __init__(self, host: str, token: str, config: BackupRestoreConfig): + self.config = config + + self._api: GoodDataApi = GoodDataApi(host, token) + self.logger: LogObserver = LogObserver() + + self.storage = self._get_storage(self.config) + + self.yaml_utils = YamlUtils() + self.json_utils = JsonUtils() + + def _get_storage(self, conf: BackupRestoreConfig) -> BackupStorage: + """Returns the storage class based on the storage type.""" + if conf.storage_type == StorageType.S3: + return S3Storage(conf) + elif conf.storage_type == StorageType.LOCAL: + return LocalStorage(conf) + else: + raise RuntimeError( + f'Unsupported storage type "{conf.storage_type.value}".' 
+ ) + + @classmethod + def create( + cls: Type[ManagerT], + config: BackupRestoreConfig, + host: str, + token: str, + ) -> ManagerT: + """Creates a backup worker instance using the provided host and token.""" + return cls(host=host, token=token, config=config) + + @classmethod + def create_from_profile( + cls: Type[ManagerT], + config: BackupRestoreConfig, + profile: str = "default", + profiles_path: Path = PROFILES_FILE_PATH, + ) -> ManagerT: + """Creates a backup worker instance using a GoodData profile file.""" + content = profile_content(profile, profiles_path) + return cls(host=content["host"], token=content["token"], config=config) diff --git a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/constants.py b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/constants.py index 900dff511..b663816c3 100644 --- a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/constants.py +++ b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/constants.py @@ -23,13 +23,14 @@ class DirNames: @attrs.frozen class ApiDefaults: - DEFAULT_PAGE_SIZE = 100 - DEFAULT_BATCH_SIZE = 100 - DEFAULT_API_CALLS_PER_SECOND = 1.0 + PAGE_SIZE = 100 + BATCH_SIZE = 100 + CALLS_PER_SECOND = 1.0 @attrs.frozen -class BackupSettings(ApiDefaults): +class BackupSettings: + API = ApiDefaults() MAX_RETRIES = 3 RETRY_DELAY = 5 # seconds TIMESTAMP_SDK_FOLDER = ( diff --git a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/models/storage.py b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/models/storage.py index 83c2d6056..f75c79c2c 100644 --- a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/models/storage.py +++ b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/models/storage.py @@ -1,10 +1,10 @@ # (C) 2025 GoodData Corporation from enum import Enum -from typing import Annotated, TypeAlias, Optional +from typing import Annotated import yaml -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator from gooddata_pipelines.backup_and_restore.constants import BackupSettings @@ -17,18 +17,32 @@ class StorageType(Enum): class S3StorageConfig(BaseModel): - """Configuration for S3 storage.""" + """Configuration for S3 storage. + + Can be created using the following constructor methods: + - `from_iam_role` + - `from_aws_credentials` + - `from_aws_profile` + """ backup_path: str bucket: str - profile: Optional[str] = None - aws_access_key_id: Optional[str] = None - aws_secret_access_key: Optional[str] = None - aws_default_region: Optional[str] = "us-east-1" + profile: str | None = None + aws_access_key_id: str | None = None + aws_secret_access_key: str | None = None + aws_default_region: str = "us-east-1" @classmethod def from_iam_role(cls, backup_path: str, bucket: str) -> "S3StorageConfig": - """Use default IAM role or environment credentials.""" + """Use default IAM role or environment credentials. + + Args: + backup_path: The path to the backup directory. + bucket: The name of the S3 bucket. + + Returns: + S3StorageConfig: The S3 storage configuration. + """ return cls(backup_path=backup_path, bucket=bucket) @classmethod @@ -40,7 +54,18 @@ def from_aws_credentials( aws_secret_access_key: str, aws_default_region: str, ) -> "S3StorageConfig": - """Use explicit AWS access keys and region.""" + """Use explicit AWS access keys and region. + + Args: + backup_path: The path to the backup directory. + bucket: The name of the S3 bucket. + aws_access_key_id: The AWS access key ID. + aws_secret_access_key: The AWS secret access key. 
+ aws_default_region: The AWS default region. + + Returns: + S3StorageConfig: The S3 storage configuration. + """ return cls( backup_path=backup_path, bucket=bucket, @@ -53,46 +78,79 @@ def from_aws_credentials( def from_aws_profile( cls, backup_path: str, bucket: str, profile: str ) -> "S3StorageConfig": - """Use a named AWS CLI profile.""" + """Use a named AWS CLI profile. + + Args: + backup_path: The path to the backup directory. + bucket: The name of the S3 bucket. + profile: The name of the AWS profile. + + Returns: + S3StorageConfig: The S3 storage configuration. + """ return cls(backup_path=backup_path, bucket=bucket, profile=profile) class LocalStorageConfig(BaseModel): """Placeholder for local storage config.""" - -StorageConfig: TypeAlias = S3StorageConfig | LocalStorageConfig + backup_path: str = Field(default="local_backups") class BackupRestoreConfig(BaseModel): - """Configuration for backup and restore.""" - - storage_type: StorageType - storage: StorageConfig | None = Field(default=None) + """Configuration for backup and restore. + + Args: + storage_type: The type of storage to use. Defaults to `StorageType.LOCAL`. + storage: Storage configuration. Either `S3StorageConfig` or `LocalStorageConfig`. Defaults to `LocalStorageConfig()`. + api_page_size: The page size for fetching workspace relationships. Defaults to `BackupSettings.API.PAGE_SIZE`. + batch_size: The batch size for fetching workspace relationships. Defaults to `BackupSettings.API.BATCH_SIZE`. + api_calls_per_second: The maximum API calls per second (rate limiting). Defaults to `BackupSettings.API.CALLS_PER_SECOND`. + """ + + storage_type: StorageType = Field(default=StorageType.LOCAL) + storage: S3StorageConfig | LocalStorageConfig = Field( + default_factory=LocalStorageConfig + ) api_page_size: Annotated[ int, Field( gt=0, description="Page size must be greater than 0", ), - ] = Field(default=BackupSettings.DEFAULT_PAGE_SIZE) + ] = Field(default=BackupSettings.API.PAGE_SIZE) batch_size: Annotated[ int, Field( gt=0, description="Batch size must be greater than 0", ), - ] = Field(default=BackupSettings.DEFAULT_BATCH_SIZE) + ] = Field(default=BackupSettings.API.BATCH_SIZE) api_calls_per_second: Annotated[ float, Field( gt=0, description="Maximum API calls per second (rate limiting)", ), - ] = Field(default=BackupSettings.DEFAULT_API_CALLS_PER_SECOND) + ] = Field(default=BackupSettings.API.CALLS_PER_SECOND) @classmethod def from_yaml(cls, conf_path: str) -> "BackupRestoreConfig": with open(conf_path, "r") as stream: conf: dict = yaml.safe_load(stream) return cls(**conf) + + @model_validator(mode="after") + def validate_storage(self) -> "BackupRestoreConfig": + """Check that the storage gets correct configuration when using S3 storage""" + if self.storage_type == StorageType.S3: + if not isinstance(self.storage, S3StorageConfig): + raise ValueError( + "S3 storage must be configured with S3StorageConfig object" + ) + elif self.storage_type == StorageType.LOCAL: + if not isinstance(self.storage, LocalStorageConfig): + raise ValueError( + "Local storage must be configured with LocalStorageConfig object" + ) + return self diff --git a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/restore_manager.py b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/restore_manager.py new file mode 100644 index 000000000..677d81ac3 --- /dev/null +++ b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/restore_manager.py @@ -0,0 +1,266 @@ +# (C) 2025 GoodData Corporation + +import os +import tempfile +import zipfile 
+from pathlib import Path +from typing import Any + +import attrs +from gooddata_sdk.catalog.workspace.declarative_model.workspace.analytics_model.analytics_model import ( + CatalogDeclarativeAnalytics, +) +from gooddata_sdk.catalog.workspace.declarative_model.workspace.automation import ( + CatalogDeclarativeAutomation, +) +from gooddata_sdk.catalog.workspace.declarative_model.workspace.logical_model.ldm import ( + CatalogDeclarativeModel, +) +from gooddata_sdk.catalog.workspace.declarative_model.workspace.workspace import ( + CatalogDeclarativeFilterView, +) +from pydantic import BaseModel, ConfigDict + +from gooddata_pipelines.backup_and_restore.base_manager import BaseManager +from gooddata_pipelines.backup_and_restore.constants import DirNames +from gooddata_pipelines.utils.decorators import log_and_reraise_exception + + +@attrs.define +class WorkspaceModel: + logical_data_model: CatalogDeclarativeModel + analytics_model: CatalogDeclarativeAnalytics + + +class WorkspaceToRestore(BaseModel): + """Workspace to restore. + + Args: + id: The ID of the workspace to restore. + path: The path to the folder containing the `gooddata_layouts.zip` file + to restore. Should be a continuation of the `backup_path` specified + in the storage configuration. Typically, it would look something like + `organization_id/workspace_id/backup_timestamp` + """ + + model_config = ConfigDict(extra="forbid") + + id: str + path: str + + +class RestoreManager(BaseManager): + """Restores previsouly created backups of workspace metadata.""" + + @log_and_reraise_exception("Failed to extract backup from zip archive.") + def _extract_zip_archive( + self, file_to_extract: Path, destination: Path + ) -> None: + """Extracts the backup from zip archive.""" + with zipfile.ZipFile(file_to_extract, "r") as zip_ref: + zip_ref.extractall(destination) + + def _check_workspace_is_valid(self, workspace_root_dir_path: Path) -> None: + """Checks if the workspace layout is valid.""" + if ( + not workspace_root_dir_path.exists() + or not workspace_root_dir_path.is_dir() + ): + self.logger.error( + "Invalid source path found upon backup fetch. " + f"Got {workspace_root_dir_path}. " + "Check if target zip contains gooddata_layouts directory." + ) + raise RuntimeError("Invalid source path upon load.") + + children = list(workspace_root_dir_path.iterdir()) + am_path = workspace_root_dir_path / DirNames.AM + ldm_path = workspace_root_dir_path / DirNames.LDM + udf_path = workspace_root_dir_path / DirNames.UDF + + if ( + am_path not in children + or ldm_path not in children + or udf_path not in children + ): + self.logger.error( + f"{DirNames.AM} or {DirNames.LDM} directory missing in the " + + "workspace hierarchy. " + ) + raise RuntimeError( + f"{DirNames.AM} or {DirNames.LDM} directory missing." 
+ ) + + @log_and_reraise_exception("Failed to load workspace declaration.") + def _load_workspace_layout( + self, workspace_root_dir_path: Path + ) -> WorkspaceModel: + """Loads the workspace layout from the backup.""" + sdk_catalog = self._api._sdk.catalog_workspace_content + + ldm = sdk_catalog.load_ldm_from_disk(workspace_root_dir_path) + am = sdk_catalog.load_analytics_model_from_disk(workspace_root_dir_path) + + return WorkspaceModel(logical_data_model=ldm, analytics_model=am) + + @log_and_reraise_exception("Failed to load user data filters from folder.") + def _load_user_data_filters(self, workspace_root_dir_path: Path) -> dict: + user_data_filters: dict = {"userDataFilters": []} + user_data_filters_folder = os.path.join( + workspace_root_dir_path, DirNames.UDF + ) + for filename in os.listdir(user_data_filters_folder): + file_path = os.path.join(user_data_filters_folder, filename) + user_data_filter = self.yaml_utils.safe_load(Path(file_path)) + user_data_filters["userDataFilters"].append(user_data_filter) + + return user_data_filters + + @log_and_reraise_exception("Failed to put workspace layout into GoodData.") + def _put_workspace_layout( + self, workspace_id: str, workspace_model: WorkspaceModel + ) -> None: + """Puts the workspace layout into GoodData.""" + self._api._sdk.catalog_workspace_content.put_declarative_ldm( + workspace_id, workspace_model.logical_data_model + ) + self._api._sdk.catalog_workspace_content.put_declarative_analytics_model( + workspace_id, workspace_model.analytics_model + ) + + @log_and_reraise_exception("Failed to put user data filters.") + def _put_user_data_filters( + self, workspace_id: str, user_data_filters: dict + ) -> None: + """Puts the user data filters into GoodData workspace.""" + response = self._api.put_user_data_filters( + workspace_id, user_data_filters + ) + self._api.raise_if_response_not_ok(response) + + def _load_and_put_filter_views( + self, workspace_id: str, workspace_root_dir_path: Path + ) -> None: + """Loads and puts filter views into GoodData workspace.""" + filter_views: list[CatalogDeclarativeFilterView] = [] + if not (workspace_root_dir_path / "filter_views").exists(): + # Skip if the filter_views directory does not exist + return + + for file in Path(workspace_root_dir_path / "filter_views").iterdir(): + filter_view_content: dict[str, Any] = dict( + self.yaml_utils.safe_load(file) + ) + filter_view: CatalogDeclarativeFilterView = ( + CatalogDeclarativeFilterView.from_dict(filter_view_content) + ) + filter_views.append(filter_view) + + if filter_views: + self._api._sdk.catalog_workspace.put_declarative_filter_views( + workspace_id, filter_views + ) + + def _load_and_post_automations( + self, workspace_id: str, workspace_root_dir_path: Path + ) -> None: + """Loads automations from specified json file and creates them in the workspace.""" + # Load automations from JSON + path_to_json: Path = Path( + workspace_root_dir_path, "automations", "automations.json" + ) + + # Both the folder and the file must exist, otherwise skip + if not (workspace_root_dir_path.exists() and path_to_json.exists()): + return + + # Delete all automations from the workspace and restore the automations from the backup. 
+ self._delete_all_automations(workspace_id) + + data: dict = self.json_utils.load(path_to_json) + automations: list[dict] = data["data"] + + for automation in automations: + self._post_automation(workspace_id, automation) + + def _delete_all_automations(self, workspace_id: str) -> None: + """Deletes all automations in the workspace.""" + automations: list[CatalogDeclarativeAutomation] = ( + self._api._sdk.catalog_workspace.get_declarative_automations( + workspace_id + ) + ) + for automation in automations: + self._api.delete_automation(workspace_id, automation.id) + + def _post_automation(self, workspace_id: str, automation: dict) -> None: + """Posts a scheduled export to the workspace.""" + attributes: dict = automation["attributes"] + relationships: dict = automation["relationships"] + id: str = automation["id"] + + if attributes.get("schedule"): + if attributes["schedule"].get("cronDescription"): + # The cron description attribute is causing a 500 ("No mapping found...") + # error. Known and reported issue. + del attributes["schedule"]["cronDescription"] + + data = { + "data": { + "attributes": attributes, + "id": id, + "type": "automation", + "relationships": relationships, + } + } + + response = self._api.post_automation(workspace_id, data) + + if not response.ok: + self.logger.error( + f"Failed to post automation ({response.status_code}): {response.text}" + ) + + def _restore_backup( + self, workspace_to_restore: WorkspaceToRestore, tempdir_path: Path + ) -> None: + """Restores the backup of a workspace.""" + + zip_target = tempdir_path / f"{DirNames.LAYOUTS}.zip" + src_path = tempdir_path / DirNames.LAYOUTS + + try: + self.storage.get_ws_declaration( + workspace_to_restore.path, str(zip_target) + ) + self._extract_zip_archive(zip_target, tempdir_path) + self._check_workspace_is_valid(src_path) + workspace_model: WorkspaceModel = self._load_workspace_layout( + src_path + ) + user_data_filters = self._load_user_data_filters(src_path) + self._put_workspace_layout(workspace_to_restore.id, workspace_model) + self._put_user_data_filters( + workspace_to_restore.id, user_data_filters + ) + self._load_and_put_filter_views(workspace_to_restore.id, src_path) + self._load_and_post_automations(workspace_to_restore.id, src_path) + self.logger.info( + f"Finished backup restore of {workspace_to_restore.id} from {workspace_to_restore.path}." + ) + except Exception as e: + self.logger.error( + f"Failed to restore backup of {workspace_to_restore.id} from {workspace_to_restore.path}. " + f"Error caused by {e.__class__.__name__}: {e}." + ) + + def restore(self, workspaces_to_restore: list[WorkspaceToRestore]) -> None: + """Restores the backups of workspaces. + + Args: + workspaces_to_restore: List of workspaces to restore. 
+ """ + for workspace_to_restore in workspaces_to_restore: + with tempfile.TemporaryDirectory() as tempdir: + tempdir_path = Path(tempdir) + self._restore_backup(workspace_to_restore, tempdir_path) diff --git a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/base_storage.py b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/base_storage.py index 313199711..d3ea20167 100644 --- a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/base_storage.py +++ b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/base_storage.py @@ -12,7 +12,18 @@ class BackupStorage(abc.ABC): def __init__(self, conf: BackupRestoreConfig): self.logger = LogObserver() + suffix = "/" if not conf.storage.backup_path.endswith("/") else "" + self._backup_path = conf.storage.backup_path + suffix + @abc.abstractmethod def export(self, folder: str, org_id: str) -> None: """Exports the content of the folder to the storage.""" raise NotImplementedError + + @abc.abstractmethod + def get_ws_declaration( + self, target_path: str, local_target_path: str + ) -> None: + raise NotImplementedError( + "This method should be implemented by the subclass." + ) diff --git a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/local_storage.py b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/local_storage.py index 83d4ea0b8..b5760be61 100644 --- a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/local_storage.py +++ b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/local_storage.py @@ -5,6 +5,7 @@ from gooddata_pipelines.backup_and_restore.models.storage import ( BackupRestoreConfig, + LocalStorageConfig, ) from gooddata_pipelines.backup_and_restore.storage.base_storage import ( BackupStorage, @@ -14,24 +15,34 @@ class LocalStorage(BackupStorage): def __init__(self, conf: BackupRestoreConfig): super().__init__(conf) + if not isinstance(conf.storage, LocalStorageConfig): + raise ValueError("Local storage config is required") + self._config: LocalStorageConfig = conf.storage - def _export( - self, folder: str, org_id: str, export_folder: str = "local_backups" - ) -> None: + def _export(self, folder: str, org_id: str, export_folder: str) -> None: """Copies the content of the folder to local storage as backup.""" + self.logger.info(f"Saving {org_id} to local storage") shutil.copytree( Path(folder), Path(Path.cwd(), export_folder), dirs_exist_ok=True ) - def export( - self, folder: str, org_id: str, export_folder: str = "local_backups" - ) -> None: + def export(self, folder: str, org_id: str) -> None: """Copies the content of the folder to local storage as backup.""" try: - self._export(folder, org_id, export_folder) + self._export(folder, org_id, self._config.backup_path) except Exception as e: self.logger.error( - f"Error exporting {folder} to {export_folder}: {e}" + f"Error exporting {folder} to {self._config.backup_path}: {e}" ) raise + + def get_ws_declaration( + self, target_path: str, local_target_path: str + ) -> None: + """Retrieves workspace declaration from local storage and copies to the local target path. + + The local target should be a temporary directory. 
+ """ + file_to_copy = self._backup_path + target_path + "/gooddata_layouts.zip" + shutil.copy(file_to_copy, local_target_path) diff --git a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/s3_storage.py b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/s3_storage.py index 7a136a3e3..4205c2827 100644 --- a/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/s3_storage.py +++ b/gooddata-pipelines/gooddata_pipelines/backup_and_restore/storage/s3_storage.py @@ -25,8 +25,6 @@ def __init__(self, conf: BackupRestoreConfig): self._client = self._session.client("s3") self._resource = self._session.resource("s3") self._bucket = self._resource.Bucket(self._config.bucket) # type: ignore [missing library stubs] - suffix = "/" if not self._config.backup_path.endswith("/") else "" - self._backup_path = self._config.backup_path + suffix self._verify_connection() @@ -95,3 +93,28 @@ def export(self, folder: str, org_id: str) -> None: self._client.put_object( Bucket=self._config.bucket, Key=export_path, Body=data ) + + def get_ws_declaration( + self, target_path: str, local_target_path: str + ) -> None: + """Retrieves workspace declaration from S3 bucket.""" + target_s3_prefix = f"{self._backup_path}{target_path}" + + objs_found = list(self._bucket.objects.filter(Prefix=target_s3_prefix)) + + # Remove the included directory (which equals prefix) on hit + objs_found = objs_found[1:] if len(objs_found) > 0 else objs_found + + if not objs_found: + message = f"No target backup found for {target_s3_prefix}." + self.logger.error(message) + raise Exception(message) + + if len(objs_found) > 1: + self.logger.warning( + f"Multiple backups found at {target_s3_prefix}." + " Continuing with the first one, ignoring the rest..." + ) + + s3_obj = objs_found[0] + self._bucket.download_file(s3_obj.key, local_target_path) diff --git a/gooddata-pipelines/gooddata_pipelines/provisioning/utils/utils.py b/gooddata-pipelines/gooddata_pipelines/provisioning/utils/utils.py index c8bf2af44..e042b792f 100644 --- a/gooddata-pipelines/gooddata_pipelines/provisioning/utils/utils.py +++ b/gooddata-pipelines/gooddata_pipelines/provisioning/utils/utils.py @@ -26,10 +26,11 @@ def get_attrs( Returns: dict: Returns a dictionary of the objects' attributes. """ - # TODO: This might not work great with nested objects, values which are lists of objects etc. 
- # If we care about parsing the logs back from the string, we should consider some other approach attributes: dict[str, str] = {} - for context_object in objects: + for index, context_object in enumerate(objects): + if isinstance(context_object, str): + attributes[f"string_context_{index}"] = context_object + if isinstance(context_object, Response): # for request.Response objects, keys need to be renamed to match the log schema attributes.update( @@ -48,10 +49,12 @@ def get_attrs( cast(attrs.AttrsInstance, context_object) ).items(): self._add_to_dict(attributes, key, value) - else: + elif hasattr(context_object, "__dict__"): # Generic handling for other objects for key, value in context_object.__dict__.items(): self._add_to_dict(attributes, key, value) + else: + attributes[f"object_{index}"] = str(context_object) if overrides: attributes.update(overrides) diff --git a/gooddata-pipelines/gooddata_pipelines/utils/decorators.py b/gooddata-pipelines/gooddata_pipelines/utils/decorators.py new file mode 100644 index 000000000..13ade311b --- /dev/null +++ b/gooddata-pipelines/gooddata_pipelines/utils/decorators.py @@ -0,0 +1,30 @@ +# (C) 2025 GoodData Corporation + +from typing import Any, Callable + +from gooddata_pipelines.logger.logger import LogObserver + +logger: LogObserver = LogObserver() + + +def log_and_reraise_exception(message: str) -> Callable: + """ + Decorator to log an exception and re-raise it. + + Args: + message (str): The message to log. + """ + + def decorator(fn: Callable) -> Callable: + def wrapper(*method_args: Any, **method_kwargs: Any) -> Callable: + try: + return fn(*method_args, **method_kwargs) + except Exception: + logger.error( + f"{message}, {fn.__name__}, Args: {method_args}, Kwargs: {method_kwargs}" + ) + raise + + return wrapper + + return decorator diff --git a/gooddata-pipelines/gooddata_pipelines/utils/file_utils.py b/gooddata-pipelines/gooddata_pipelines/utils/file_utils.py new file mode 100644 index 000000000..a88681eb0 --- /dev/null +++ b/gooddata-pipelines/gooddata_pipelines/utils/file_utils.py @@ -0,0 +1,63 @@ +# (C) 2025 GoodData Corporation + +import json +from pathlib import Path +from typing import Any + +import attrs +import yaml + + +class PathUtils: + """Handles common path operations.""" + + @staticmethod + def validate_path(path: str | Path) -> Path: + """Validates a path.""" + if not isinstance(path, Path): + path = Path(path) + + return path + + def check_path_exists(self, path: Path) -> None: + """Checks if a path exists.""" + if not self.validate_path(path).exists(): + raise FileNotFoundError(f"File {path} does not exist.") + + +@attrs.define +class JsonUtils: + """Handles common JSON interactions.""" + + path_utils: PathUtils = attrs.field(factory=PathUtils) + + def load(self, path: Path) -> Any: + """Loads a JSON file.""" + self.path_utils.check_path_exists(path) + + with open(path, "r") as f: + return json.load(f) + + def dump(self, path: Path, data: Any) -> None: + """Writes the source to a JSON file.""" + with open(path, "w") as output_file: + json.dump(data, output_file) + + +@attrs.define +class YamlUtils: + """Handles common YMAL interactions.""" + + path_utils: PathUtils = attrs.field(factory=PathUtils) + + def safe_load(self, path: Path) -> Any: + """Safe loads a YAML file.""" + self.path_utils.check_path_exists(path) + + with open(path, "r") as f: + return yaml.safe_load(f) + + def dump(self, path: str, data: Any) -> None: + """Writes the source to a YAML file.""" + with open(path, "w") as output_file: + yaml.dump(data, 
output_file) diff --git a/gooddata-pipelines/tests/backup_and_restore/test_backup.py b/gooddata-pipelines/tests/backup_and_restore/test_backup.py index 4e425550b..f48abb12e 100644 --- a/gooddata-pipelines/tests/backup_and_restore/test_backup.py +++ b/gooddata-pipelines/tests/backup_and_restore/test_backup.py @@ -17,6 +17,7 @@ from gooddata_pipelines.backup_and_restore.constants import BackupSettings from gooddata_pipelines.backup_and_restore.models.storage import ( BackupRestoreConfig, + LocalStorageConfig, S3StorageConfig, StorageType, ) @@ -31,7 +32,10 @@ S3_BACKUP_PATH = "some/s3/backup/path/org_id/" S3_BUCKET = "some-s3-bucket" -LOCAL_CONFIG = BackupRestoreConfig(storage_type=StorageType.LOCAL) +LOCAL_CONFIG = BackupRestoreConfig( + storage_type=StorageType.LOCAL, + storage=LocalStorageConfig(backup_path=f"{TEST_DATA_DIR}/local_export"), +) S3_CONFIG = BackupRestoreConfig( storage_type=StorageType.S3, @@ -249,7 +253,6 @@ def test_local_storage_export(backup_manager): local_storage.export( folder=tmpdir, org_id="services", - export_folder=f"{TEST_DATA_DIR}/local_export", ) local_export_folder_exist = os.path.isdir( diff --git a/gooddata-pipelines/tests/backup_and_restore/test_restore.py b/gooddata-pipelines/tests/backup_and_restore/test_restore.py new file mode 100644 index 000000000..4bce35a8b --- /dev/null +++ b/gooddata-pipelines/tests/backup_and_restore/test_restore.py @@ -0,0 +1,346 @@ +# (C) 2025 GoodData Corporation + +import json +import os +import shutil +import tempfile +from pathlib import Path + +import pytest + +from gooddata_pipelines.backup_and_restore.constants import DirNames +from gooddata_pipelines.backup_and_restore.models.storage import ( + BackupRestoreConfig, + S3StorageConfig, + StorageType, +) +from gooddata_pipelines.backup_and_restore.restore_manager import ( + RestoreManager, + WorkspaceModel, + WorkspaceToRestore, +) +from gooddata_pipelines.backup_and_restore.storage.s3_storage import S3Storage +from tests.conftest import TEST_DATA_DIR + +TEST_DATA_SUBDIR = f"{TEST_DATA_DIR}/restore" + +MOCK_DL_TARGET = Path("overlays.zip") +TEST_CONF_PATH = f"{TEST_DATA_SUBDIR}/test_conf.yaml" +TEST_UDF_ROOT = Path(f"{TEST_DATA_SUBDIR}/test_udf_root").absolute() + +S3_BACKUP_PATH = "some/s3/backup/path/org_id/" +S3_BUCKET = "some-s3-bucket" + + +# Small reusable fixtures to avoid repeated mocking +@pytest.fixture() +def conf_s3() -> BackupRestoreConfig: + return BackupRestoreConfig.from_yaml(TEST_CONF_PATH) + + +@pytest.fixture() +def s3_storage(mocker, conf_s3: BackupRestoreConfig) -> S3Storage: + mocker.patch.object(S3Storage, "_verify_connection", return_value=None) + storage = S3Storage(conf_s3) + return storage + + +@pytest.fixture() +def s3_bucket(mocker, s3_storage: S3Storage): + bucket = mocker.MagicMock() + s3_storage._bucket = bucket # type: ignore[attr-defined] + return bucket + + +@pytest.fixture() +def gd_api_instance(mocker): + gd_api_cls = mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.GoodDataApi", + ) + gd_api_instance = gd_api_cls.return_value + gd_api_instance._sdk = mocker.MagicMock() + return gd_api_instance + + +@pytest.fixture() +def restore_manager(mocker, gd_api_instance) -> RestoreManager: # noqa: ARG001 + mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.S3Storage", + autospec=True, + ) + config = BackupRestoreConfig( + storage_type=StorageType.S3, + storage=S3StorageConfig(backup_path=S3_BACKUP_PATH, bucket=S3_BUCKET), + ) + manager = RestoreManager.create(host="host", token="token", config=config) + return 
manager + + +def assert_not_called_with(target, *args, **kwargs): + try: + target.assert_called_with(*args, **kwargs) + except AssertionError: + return + formatted_call = target._format_mock_call_signature(args, kwargs) + raise AssertionError(f"Expected {formatted_call} to not have been called.") + + +def test_s3_storage_success(s3_storage, s3_bucket, mocker): + """S3Storage: downloads the first zip object when objects are present.""" + dir_marker = mocker.MagicMock() + zip_obj = mocker.MagicMock() + zip_obj.key = f"{S3_BACKUP_PATH}ws_id/gooddata_layouts.zip" + s3_bucket.objects.filter.return_value = [dir_marker, zip_obj] + + with tempfile.TemporaryDirectory() as tempdir: + target_path = Path(tempdir, MOCK_DL_TARGET) + s3_storage.get_ws_declaration("ws_id/", target_path) + s3_bucket.download_file.assert_called_once_with( + zip_obj.key, target_path + ) + + +def test_s3_storage_no_target_only_dir(s3_storage, s3_bucket, mocker): + """S3Storage: raises when only a directory marker exists under the prefix.""" + s3_bucket.objects.filter.return_value = [mocker.MagicMock()] + with pytest.raises(Exception): + s3_storage.get_ws_declaration("ws_id/", MOCK_DL_TARGET) + + +def test_s3_storage_no_target(s3_storage, s3_bucket, mocker): + """S3Storage: raises when no objects exist for the given prefix.""" + s3_bucket.objects.filter.return_value = [] + with pytest.raises(Exception): + s3_storage.get_ws_declaration("bad_target/", MOCK_DL_TARGET) + + +def test_restore_empty_workspace(restore_manager, gd_api_instance, mocker): + """RestoreManager: valid layout triggers LDM and AM PUTs.""" + mocker.patch.object( + restore_manager.storage, "get_ws_declaration", return_value=None + ) + + def create_empty_ws(_, destination: Path): + os.mkdir(destination / DirNames.LAYOUTS) + os.mkdir(destination / DirNames.LAYOUTS / DirNames.LDM) + os.mkdir(destination / DirNames.LAYOUTS / DirNames.AM) + os.mkdir(destination / DirNames.LAYOUTS / DirNames.UDF) + os.mkdir(destination / DirNames.LAYOUTS / "filter_views") + os.mkdir(destination / DirNames.LAYOUTS / "automations") + + mocker.patch.object( + restore_manager, "_extract_zip_archive", side_effect=create_empty_ws + ) + + workspace_model = WorkspaceModel( + logical_data_model=mocker.Mock(), analytics_model=mocker.Mock() + ) + mocker.patch.object( + restore_manager, "_load_workspace_layout", return_value=workspace_model + ) + mocker.patch.object( + restore_manager, + "_load_user_data_filters", + return_value={"userDataFilters": []}, + ) + mocker.patch.object( + restore_manager, "_load_and_put_filter_views", return_value=None + ) + mocker.patch.object( + restore_manager, "_load_and_post_automations", return_value=None + ) + + restore_manager.restore( + [WorkspaceToRestore(id="ws_id", path="some/ws/path")] + ) + + gd_api_instance._sdk.catalog_workspace_content.put_declarative_ldm.assert_called_once_with( + "ws_id", workspace_model.logical_data_model + ) + gd_api_instance._sdk.catalog_workspace_content.put_declarative_analytics_model.assert_called_once_with( + "ws_id", workspace_model.analytics_model + ) + + +def test_invalid_workspace_on_disk_is_skipped( + restore_manager, gd_api_instance, mocker +): + """RestoreManager: invalid layout (missing dirs) is skipped; no PUTs.""" + mocker.patch.object( + restore_manager.storage, "get_ws_declaration", return_value=None + ) + + def create_invalid_ws(_, destination: Path): + os.mkdir(destination / DirNames.LAYOUTS) + os.mkdir(destination / DirNames.LAYOUTS / DirNames.LDM) + # Missing AM and UDF + + mocker.patch.object( + 
restore_manager, "_extract_zip_archive", side_effect=create_invalid_ws + ) + + restore_manager.restore( + [WorkspaceToRestore(id="ws_id", path="some/ws/path")] + ) + + gd_api_instance._sdk.catalog_workspace_content.put_declarative_ldm.assert_not_called() + gd_api_instance._sdk.catalog_workspace_content.put_declarative_analytics_model.assert_not_called() + + +def test_restore_multiple_workspaces_with_partial_failure( + restore_manager, gd_api_instance, mocker +): + """RestoreManager: multiple targets; on partial failure only successful PUTs occur.""" + ws_catalog = gd_api_instance._sdk.catalog_workspace_content + + mocker.patch.object( + restore_manager.storage, "get_ws_declaration", return_value=None + ) + + def create_valid_ws(_, destination: Path): + os.mkdir(destination / DirNames.LAYOUTS) + os.mkdir(destination / DirNames.LAYOUTS / DirNames.LDM) + os.mkdir(destination / DirNames.LAYOUTS / DirNames.AM) + os.mkdir(destination / DirNames.LAYOUTS / DirNames.UDF) + + mocker.patch.object( + restore_manager, "_extract_zip_archive", side_effect=create_valid_ws + ) + + workspace_model = WorkspaceModel( + logical_data_model=mocker.Mock(), analytics_model=mocker.Mock() + ) + # First load succeeds, second raises + mocker.patch.object( + restore_manager, + "_load_workspace_layout", + side_effect=[workspace_model, Exception()], + ) + mocker.patch.object( + restore_manager, + "_load_user_data_filters", + return_value={"userDataFilters": []}, + ) + + targets = [ + WorkspaceToRestore(id="ws_id_1", path="ws_id_1"), + WorkspaceToRestore(id="ws_id_2", path="ws_id_1"), + ] + restore_manager.restore(targets) + + ws_catalog.put_declarative_ldm.assert_any_call( + "ws_id_1", workspace_model.logical_data_model + ) + ws_catalog.put_declarative_analytics_model.assert_any_call( + "ws_id_1", workspace_model.analytics_model + ) + assert_not_called_with( + ws_catalog.put_declarative_ldm, "ws_id_2", mocker.ANY + ) + assert_not_called_with( + ws_catalog.put_declarative_analytics_model, "ws_id_2", mocker.ANY + ) + + +def test_load_user_data_filters_reads_yaml(mocker): + """RestoreManager: reads YAML UDFs into expected API body structure.""" + mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.S3Storage", + autospec=True, + ) + gd_api_cls = mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.GoodDataApi", + ) + gd_api_instance = gd_api_cls.return_value + gd_api_instance._sdk = mocker.MagicMock() + + config = BackupRestoreConfig( + storage_type=StorageType.S3, + storage=S3StorageConfig(backup_path=S3_BACKUP_PATH, bucket=S3_BUCKET), + ) + manager = RestoreManager.create(host="host", token="token", config=config) + + # Build a clean temp directory and copy only YAML fixtures from test data + with tempfile.TemporaryDirectory() as tempdir: + temp_root = Path(tempdir) + src_udf_dir = (TEST_UDF_ROOT / DirNames.UDF).absolute() + dst_udf_dir = temp_root / DirNames.UDF + + def ignore_non_yaml(_dir, names): + return [n for n in names if not n.endswith(".yaml")] + + shutil.copytree(src_udf_dir, dst_udf_dir, ignore=ignore_non_yaml) + + result = manager._load_user_data_filters(temp_root) + + user_data_filters_expected = { + "userDataFilters": [ + { + "id": "datafilter2", + "maql": '{label/campaign_channels.category} = "1"', + "title": "Status filter", + "user": { + "id": "5c867a8a-12af-45bf-8d85-c7d16bedebd1", + "type": "user", + }, + }, + { + "id": "datafilter4", + "maql": '{label/campaign_channels.category} = "1"', + "title": "Status filter", + "user": { + "id": 
"5c867a8a-12af-45bf-8d85-c7d16bedebd1", + "type": "user", + }, + }, + ] + } + + sorted_result = sorted( + json.dumps(d, sort_keys=True) for d in result["userDataFilters"] + ) + sorted_expected = sorted( + json.dumps(d, sort_keys=True) + for d in user_data_filters_expected["userDataFilters"] + ) + + assert sorted_result == sorted_expected + + +def test_manager_create_uses_s3_storage(mocker): + """RestoreManager.create: builds S3 storage when storage_type is S3.""" + storage_cls = mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.S3Storage", + autospec=True, + ) + mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.GoodDataApi", + ) + config = BackupRestoreConfig( + storage_type=StorageType.S3, + storage=S3StorageConfig(backup_path=S3_BACKUP_PATH, bucket=S3_BUCKET), + ) + RestoreManager.create(host="host", token="token", config=config) + storage_cls.assert_called_once() + + +def test_manager_create_from_profile(mocker): + """RestoreManager.create_from_profile: uses profile content to init GoodDataApi.""" + mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.S3Storage", + autospec=True, + ) + gd_api_cls = mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.GoodDataApi", + ) + mocker.patch( + "gooddata_pipelines.backup_and_restore.base_manager.profile_content", + return_value={"host": "h", "token": "t"}, + ) + + config = BackupRestoreConfig( + storage_type=StorageType.S3, + storage=S3StorageConfig(backup_path=S3_BACKUP_PATH, bucket=S3_BUCKET), + ) + RestoreManager.create_from_profile(config) + gd_api_cls.assert_called_once_with("h", "t") diff --git a/gooddata-pipelines/tests/data/restore/test_conf.yaml b/gooddata-pipelines/tests/data/restore/test_conf.yaml new file mode 100644 index 000000000..f85808a41 --- /dev/null +++ b/gooddata-pipelines/tests/data/restore/test_conf.yaml @@ -0,0 +1,6 @@ +# (C) 2025 GoodData Corporation +storage_type: s3 +storage: + bucket: some-s3-bucket + backup_path: some/s3/backup/path/org_id/ + profile: default diff --git a/gooddata-pipelines/tests/data/restore/test_udf_root/filter1.yaml b/gooddata-pipelines/tests/data/restore/test_udf_root/filter1.yaml new file mode 100644 index 000000000..780462e8d --- /dev/null +++ b/gooddata-pipelines/tests/data/restore/test_udf_root/filter1.yaml @@ -0,0 +1,7 @@ +# (C) 2025 GoodData Corporation +id: datafilter2 +title: Status filter +maql: '{label/campaign_channels.category} = "1"' +user: + id: 5c867a8a-12af-45bf-8d85-c7d16bedebd1 + type: user diff --git a/gooddata-pipelines/tests/data/restore/test_udf_root/filter2.yaml b/gooddata-pipelines/tests/data/restore/test_udf_root/filter2.yaml new file mode 100644 index 000000000..d6c671fda --- /dev/null +++ b/gooddata-pipelines/tests/data/restore/test_udf_root/filter2.yaml @@ -0,0 +1,7 @@ +# (C) 2025 GoodData Corporation +id: datafilter4 +title: Status filter +maql: '{label/campaign_channels.category} = "1"' +user: + id: 5c867a8a-12af-45bf-8d85-c7d16bedebd1 + type: user diff --git a/gooddata-pipelines/tests/data/restore/test_udf_root/user_data_filters/filter1.yaml b/gooddata-pipelines/tests/data/restore/test_udf_root/user_data_filters/filter1.yaml new file mode 100644 index 000000000..780462e8d --- /dev/null +++ b/gooddata-pipelines/tests/data/restore/test_udf_root/user_data_filters/filter1.yaml @@ -0,0 +1,7 @@ +# (C) 2025 GoodData Corporation +id: datafilter2 +title: Status filter +maql: '{label/campaign_channels.category} = "1"' +user: + id: 5c867a8a-12af-45bf-8d85-c7d16bedebd1 + type: user diff --git 
a/gooddata-pipelines/tests/data/restore/test_udf_root/user_data_filters/filter2.yaml b/gooddata-pipelines/tests/data/restore/test_udf_root/user_data_filters/filter2.yaml new file mode 100644 index 000000000..d6c671fda --- /dev/null +++ b/gooddata-pipelines/tests/data/restore/test_udf_root/user_data_filters/filter2.yaml @@ -0,0 +1,7 @@ +# (C) 2025 GoodData Corporation +id: datafilter4 +title: Status filter +maql: '{label/campaign_channels.category} = "1"' +user: + id: 5c867a8a-12af-45bf-8d85-c7d16bedebd1 + type: user diff --git a/gooddata-pipelines/tests/utils/test_decorators.py b/gooddata-pipelines/tests/utils/test_decorators.py new file mode 100644 index 000000000..8c92e7b1d --- /dev/null +++ b/gooddata-pipelines/tests/utils/test_decorators.py @@ -0,0 +1,40 @@ +# (C) 2025 GoodData Corporation + +from typing import Any + +import pytest + +from gooddata_pipelines.utils.decorators import log_and_reraise_exception + + +@pytest.fixture() +def mocked_logger(mocker): + return mocker.patch("gooddata_pipelines.utils.decorators.logger") + + +def test_log_and_re_raise_no_exception(mocked_logger): + """Decorator should return inner function result and not log on success.""" + + @log_and_reraise_exception("no-op") + def target(a: int, b: int, *, c: int) -> int: + return a + b + c + + result = target(1, 2, c=3) + + assert result == 6 + mocked_logger.error.assert_not_called() + + +def test_log_and_re_raise_logs_and_reraises(mocked_logger): + """Decorator should log error and re-raise the original exception.""" + + @log_and_reraise_exception("boom") + def target(*args: Any, **kwargs: Any) -> None: + raise ValueError("explosion") + + with pytest.raises(ValueError, match="explosion"): + target(1, 2, a=3) + + mocked_logger.error.assert_called_once_with( + "boom, target, Args: (1, 2), Kwargs: {'a': 3}" + ) From 163731c6e07134e7f688f1ff626d757dea879082 Mon Sep 17 00:00:00 2001 From: janmatzek Date: Tue, 7 Oct 2025 13:48:13 +0200 Subject: [PATCH 2/2] docs(gooddata-pipelines): backup and restore documentation --- .../pipelines/backup_and_restore/_index.md | 15 ++ .../pipelines/backup_and_restore/backup.md | 147 ++++++++++++++++++ .../backup_and_restore/configuration.md | 135 ++++++++++++++++ .../pipelines/backup_and_restore/restore.md | 102 ++++++++++++ 4 files changed, 399 insertions(+) create mode 100644 docs/content/en/latest/pipelines/backup_and_restore/_index.md create mode 100644 docs/content/en/latest/pipelines/backup_and_restore/backup.md create mode 100644 docs/content/en/latest/pipelines/backup_and_restore/configuration.md create mode 100644 docs/content/en/latest/pipelines/backup_and_restore/restore.md diff --git a/docs/content/en/latest/pipelines/backup_and_restore/_index.md b/docs/content/en/latest/pipelines/backup_and_restore/_index.md new file mode 100644 index 000000000..60de5cf6d --- /dev/null +++ b/docs/content/en/latest/pipelines/backup_and_restore/_index.md @@ -0,0 +1,15 @@ +--- +title: "Backup & Restore" +linkTitle: "Backup & Restore" +weight: 2 +no_list: true +--- + +The Backup & Restore module lets you create snapshots of GoodData Cloud workspaces and restore them later. It is useful for: + +- Backing up before major changes +- Migrating workspaces across environments +- Disaster recovery +- Cloning workspace configurations + +Backup and restore share common configuration objects, documented on the [Configuration](configuration/) page. For detailed, step-by-step instructions, see the [Backup](backup/) and [Restore](restore/) guides. 
diff --git a/docs/content/en/latest/pipelines/backup_and_restore/backup.md b/docs/content/en/latest/pipelines/backup_and_restore/backup.md new file mode 100644 index 000000000..9c7df6e49 --- /dev/null +++ b/docs/content/en/latest/pipelines/backup_and_restore/backup.md @@ -0,0 +1,147 @@ +--- +title: "Workspace Backup" +linkTitle: "Workspace Backup" +weight: 2 +--- + +Workspace Backup allows you to create backups of one or more workspaces. Backups can be stored either locally or uploaded to an S3 bucket. + +The backup stores the following definitions: + +- Logical Data Model +- Analytics Model +- User Data Filters +- Filter Views +- Automations + +## Usage + +Import and initialize the `BackupManager` and `BackupRestoreConfig` from GoodData Pipelines: + +```python +from gooddata_pipelines import BackupManager, BackupRestoreConfig + +host = "http://localhost:3000" +token = "some_user_token" + +# Create your customized backup configuration or use default values +config = BackupRestoreConfig( + storage_type="local" + ) + +# Initialize the BackupManager with your configuration and GoodData Cloud credentials +backup_manager = BackupManager.create(config=config, host=host, token=token) + +# Run a backup method. For example, the `backup_entire_organization` method backs up all workspaces in GoodData Cloud. +backup_manager.backup_entire_organization() + +``` + +## Configuration + +See [Configuration](/latest/pipelines/backup_and_restore/configuration/) for details on how to set up the configuration object. + +## Backup Methods + +You can use one of these methods to back up your workspaces: + +### Back up specific workspaces + +This method allows you to back up specific workspaces. You can supply the list of their IDs either directly or by specifying a path to a CSV file. + +#### Usage with direct input: + +```python +workspace_ids = ["workspace_1", "workspace_2", "workspace_3"] + +backup_manager.backup_workspaces(workspace_ids=workspace_ids) + +``` + +#### Usage with a CSV: + +```python +path_to_csv = "path/to/local/file.csv" + +backup_manager.backup_workspaces(path_to_csv=path_to_csv) + +``` + +### Back up workspace hierarchies + +This method accepts a list of parent workspace IDs and creates a backup of each workspace within their hierarchies. That includes the parent workspace and both its direct and indirect children (i.e., the children of child workspaces, and so on). The IDs can be provided either directly as a list or as a path to a CSV file containing the IDs. + +#### Usage with direct input: + +```python +parent_workspace_ids = ["parent_1", "parent_2", "parent_3"] + +backup_manager.backup_hierarchies(workspace_ids=parent_workspace_ids) + +``` + +#### Usage with a CSV: + +```python +path_to_csv = "path/to/local/file.csv" + +backup_manager.backup_hierarchies(path_to_csv=path_to_csv) + +``` + +### Back up entire organization + +This method creates a backup of all workspaces within the GoodData organization. It requires no arguments. 
+
+```python
+backup_manager.backup_entire_organization()
+
+```
+
+### Input CSV Format
+
+When using a CSV as input for backup, the following format is expected:
+
+| **workspace_id** |
+| ---------------- |
+| parent_1         |
+| parent_2         |
+| parent_3         |
+
+## Example
+
+Here is a full example of a workspace backup process:
+
+```python
+import logging
+import os
+
+from gooddata_pipelines import (
+    BackupManager,
+    BackupRestoreConfig,
+    S3StorageConfig,
+    StorageType,
+)
+
+# Create storage configuration
+s3_storage_config = S3StorageConfig.from_aws_profile(
+    backup_path="backup_folder", bucket="backup_bucket", profile="dev"
+)
+
+# Create backup configuration
+config = BackupRestoreConfig(storage_type=StorageType.S3, storage=s3_storage_config)
+
+# Initialize the BackupManager with your configuration and GoodData credentials
+backup_manager = BackupManager.create(
+    config, os.environ["GD_HOST"], os.environ["GD_TOKEN"]
+)
+
+# Optionally set up a logger and subscribe it to the logs from the BackupManager
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+backup_manager.logger.subscribe(logger)
+
+# Run the backup
+backup_manager.backup_workspaces(workspace_ids=["workspace_id_1", "workspace_id_2"])
+
+```
diff --git a/docs/content/en/latest/pipelines/backup_and_restore/configuration.md b/docs/content/en/latest/pipelines/backup_and_restore/configuration.md
new file mode 100644
index 000000000..84e7c3622
--- /dev/null
+++ b/docs/content/en/latest/pipelines/backup_and_restore/configuration.md
@@ -0,0 +1,135 @@
+---
+title: "Configuration"
+linkTitle: "Configuration"
+weight: 1
+---
+
+Both the backup and restore processes are configured via the `BackupRestoreConfig` class.
+
+## Usage
+
+Import `BackupRestoreConfig` from GoodData Pipelines.
+
+```python
+from gooddata_pipelines import BackupRestoreConfig
+
+```
+
+If you plan on storing your backups on S3, you will also need to import the `StorageType` enum and the `S3StorageConfig` class. You can find more details about the S3 storage configuration in the [S3 Storage](#s3-storage) section below.
+
+```python
+from gooddata_pipelines import BackupRestoreConfig, S3StorageConfig, StorageType
+
+```
+
+The `BackupRestoreConfig` class accepts the following parameters:
+
+| name                 | description                                                                                                   |
+| -------------------- | ------------------------------------------------------------------------------------------------------------- |
+| storage_type         | The type of storage to use - either `local` or `s3`. Defaults to `local`.                                      |
+| storage              | Configuration for the storage type. Defaults to the local storage configuration.                               |
+| api_page_size        | Page size for fetching workspace relationships. Defaults to 100 when unspecified.                              |
+| batch_size           | Configures how many workspaces are backed up in a single batch. Defaults to 100 when unspecified.              |
+| api_calls_per_second | Limits the maximum number of API calls to your GoodData instance. Defaults to 1. Only applied during backup.   |
+
+## Storage
+
+The configuration supports two types of storage - local and S3.
+
+The backups are organized in a tree with the following nodes:
+
+- Organization ID
+- Workspace ID
+- Timestamped folder
+
+The timestamped folder contains a `gooddata_layouts.zip` file with the stored definitions.
+
+### Local Storage
+
+Local storage requires a single parameter - `backup_path`. It defines where the backup tree will be saved in your file system. If not defined, the library defaults to creating a `local_backups` folder in the current working directory and stores the backups there.
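+
+For illustration, here is a minimal sketch of a local configuration that also sets the tuning parameters from the table above. The values are placeholders, and the sketch assumes the parameters are passed as keyword arguments:
+
+```python
+from gooddata_pipelines import BackupRestoreConfig
+
+# Local storage (the default) with illustrative tuning values.
+# All three tuning parameters are optional; the documented defaults
+# apply when they are omitted.
+config = BackupRestoreConfig(
+    storage_type="local",
+    api_page_size=100,       # page size for workspace relationship fetches
+    batch_size=50,           # workspaces backed up in a single batch
+    api_calls_per_second=2,  # API throttle, applied during backup only
+)
+
+```
+
+With the defaults above, a backup of a workspace ends up under `local_backups/<organization_id>/<workspace_id>/<timestamp>/gooddata_layouts.zip`, and the timestamped folder is what the restore step later points at.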
+
+### S3 Storage
+
+To configure upload of the backups to S3, use the `S3StorageConfig` object:
+
+```python
+from gooddata_pipelines.backup_and_restore.models.storage import S3StorageConfig
+
+```
+
+The configuration is responsible for establishing a connection to S3, selecting a bucket, and specifying the folder where the backups will be stored or read. You can create the object in three ways, depending on the type of AWS credentials you want to use. The common arguments for all three ways are:
+
+| name        | description                                                    |
+| ----------- | -------------------------------------------------------------- |
+| bucket      | The name of the bucket to use                                   |
+| backup_path | Path to the folder serving as the root for the backup storage   |
+
+#### Config from IAM Role
+
+Uses the default IAM role or environment credentials. You only need to specify the `bucket` and `backup_path` arguments.
+
+```python
+s3_storage_config = S3StorageConfig.from_iam_role(
+    backup_path="backups_folder", bucket="backup_bucket"
+)
+
+```
+
+#### Config from AWS Profile
+
+Uses an existing profile to authenticate with AWS.
+
+```python
+s3_storage_config = S3StorageConfig.from_aws_profile(
+    backup_path="backups_folder", bucket="backup_bucket", profile="dev"
+)
+
+```
+
+#### Config from AWS Credentials
+
+Uses long-lived AWS access keys to authenticate with AWS.
+
+```python
+s3_storage_config = S3StorageConfig.from_aws_credentials(
+    backup_path="backups_folder",
+    bucket="backup_bucket",
+    aws_access_key_id="AWS_ACCESS_KEY_ID",
+    aws_secret_access_key="AWS_SECRET_ACCESS_KEY",
+    aws_default_region="us-east-1",
+)
+```
+
+## Examples
+
+Here are a couple of examples of different configuration cases.
+
+### Simple Local Backups
+
+If you want to store your backups locally and are okay with the default values, you can create the configuration object without having to specify any values:
+
+```python
+from gooddata_pipelines import BackupRestoreConfig
+
+config = BackupRestoreConfig()
+
+```
+
+### Config with S3 and AWS Profile
+
+If you plan to use S3, your config might look like this:
+
+```python
+from gooddata_pipelines import (
+    BackupRestoreConfig,
+    S3StorageConfig,
+    StorageType,
+)
+
+s3_storage_config = S3StorageConfig.from_aws_profile(
+    backup_path="backups_folder", bucket="backup_bucket", profile="dev"
+)
+
+config = BackupRestoreConfig(storage_type=StorageType.S3, storage=s3_storage_config)
+
+```
diff --git a/docs/content/en/latest/pipelines/backup_and_restore/restore.md b/docs/content/en/latest/pipelines/backup_and_restore/restore.md
new file mode 100644
index 000000000..fc4f10339
--- /dev/null
+++ b/docs/content/en/latest/pipelines/backup_and_restore/restore.md
@@ -0,0 +1,102 @@
+---
+title: "Workspace Restore"
+linkTitle: "Workspace Restore"
+weight: 3
+---
+
+Workspace Restore lets you restore previously created backups of your workspaces.
+
+## Usage
+
+Start by importing the `RestoreManager` and the `BackupRestoreConfig` configuration object from GoodData Pipelines.
+
+```python
+from gooddata_pipelines import BackupRestoreConfig, RestoreManager
+
+```
+
+Initialize the restore manager with your configuration and GoodData credentials. If you store your backups locally, you can use the default configuration values.
+
+```python
+config = BackupRestoreConfig()
+
+restore_manager = RestoreManager.create(
+    config, "host", "token"
+)
+
+```
+
+You will need to define which backups should be restored. You can do this by creating a list of `WorkspaceToRestore` objects.
+The object carries two pieces of information - the ID of the workspace that will be restored, and the location of its backup in the [backup tree](/latest/pipelines/backup_and_restore/configuration/#storage).
+
+```python
+from gooddata_pipelines import WorkspaceToRestore
+
+workspaces_to_restore = [
+    WorkspaceToRestore(
+        id="workspace_id_1",
+        path="local_backups/org_id/workspace_id_1/20251008-102252-1_52_0",
+    ),
+]
+
+```
+
+Now you can run the manager's `restore` method.
+
+```python
+restore_manager.restore(workspaces_to_restore=workspaces_to_restore)
+
+```
+
+## Configuration
+
+See [Configuration](/latest/pipelines/backup_and_restore/configuration/) for details on how to set up the configuration object.
+
+## Example
+
+Here is a full example of a workspace restore process:
+
+```python
+import logging
+import os
+
+from gooddata_pipelines import (
+    BackupRestoreConfig,
+    RestoreManager,
+    S3StorageConfig,
+    StorageType,
+    WorkspaceToRestore,
+)
+
+# Create storage configuration
+storage_config = S3StorageConfig.from_aws_profile(
+    backup_path="backup_folder", bucket="backup_bucket", profile="dev"
+)
+
+# Create restore configuration
+config = BackupRestoreConfig(storage_type=StorageType.S3, storage=storage_config)
+
+# Initialize the restore manager with the configuration object and GoodData credentials
+restore_manager = RestoreManager.create(
+    config, os.environ["GD_HOST"], os.environ["GD_TOKEN"]
+)
+
+# Optionally, set up a logger and subscribe it to RestoreManager's logs
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+restore_manager.logger.subscribe(logger)
+
+# Run the restore method with a list of WorkspaceToRestore objects
+restore_manager.restore(
+    workspaces_to_restore=[
+        WorkspaceToRestore(
+            id="workspace_id_1",
+            path="org-id/workspace_id_1/20251007-144543-1_52_0",
+        ),
+        WorkspaceToRestore(
+            id="workspace_id_2",
+            path="org-id/workspace_id_2/20251007-144543-1_52_0",
+        ),
+    ]
+)
+
+```
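+
+If your backups live in the local storage tree described in [Configuration](/latest/pipelines/backup_and_restore/configuration/#storage), you may want to restore the newest snapshot of a workspace. The helper below is only a sketch built on that layout; `latest_local_backup` is not part of the package, and it assumes the default `local_backups` root and timestamped folder names like the ones shown above:
+
+```python
+from pathlib import Path
+
+from gooddata_pipelines import WorkspaceToRestore
+
+
+def latest_local_backup(org_id: str, workspace_id: str) -> WorkspaceToRestore:
+    """Pick the newest timestamped backup folder for a workspace (local layout)."""
+    workspace_dir = Path("local_backups") / org_id / workspace_id
+    # Folder names start with a YYYYMMDD-HHMMSS timestamp, so they sort
+    # lexicographically and the maximum is the most recent backup.
+    latest = max(entry.name for entry in workspace_dir.iterdir() if entry.is_dir())
+    return WorkspaceToRestore(
+        id=workspace_id,
+        path=str(workspace_dir / latest),
+    )
+
+
+# Example usage with the restore manager from the example above:
+# restore_manager.restore(
+#     workspaces_to_restore=[latest_local_backup("org_id", "workspace_id_1")]
+# )
+```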