Skip to content

Commit bfe8f11

Browse files
committed
feat(gooddata-pipelines): Add rate limiter and use it on workspace backup
1 parent c1e6543 commit bfe8f11

File tree

7 files changed

+284
-18
lines changed

7 files changed

+284
-18
lines changed

gooddata-pipelines/gooddata_pipelines/backup_and_restore/backup_manager.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
S3Storage,
3838
)
3939
from gooddata_pipelines.logger import LogObserver
40+
from gooddata_pipelines.utils.rate_limiter import RateLimiter
4041

4142

4243
@dataclass
@@ -58,6 +59,10 @@ def __init__(self, host: str, token: str, config: BackupRestoreConfig):
5859

5960
self.loader = BackupInputProcessor(self._api, self.config.api_page_size)
6061

62+
self._api_rate_limiter = RateLimiter(
63+
calls_per_second=self.config.api_calls_per_second,
64+
)
65+
6166
@classmethod
6267
def create(
6368
cls: Type["BackupManager"],
@@ -93,11 +98,12 @@ def _get_storage(conf: BackupRestoreConfig) -> BackupStorage:
9398

9499
def get_user_data_filters(self, ws_id: str) -> dict:
95100
"""Returns the user data filters for the specified workspace."""
96-
response: requests.Response = self._api.get_user_data_filters(ws_id)
97-
if response.ok:
98-
return response.json()
99-
else:
100-
raise RuntimeError(f"{response.status_code}: {response.text}")
101+
with self._api_rate_limiter:
102+
response: requests.Response = self._api.get_user_data_filters(ws_id)
103+
if response.ok:
104+
return response.json()
105+
else:
106+
raise RuntimeError(f"{response.status_code}: {response.text}")
101107

102108
def _store_user_data_filters(
103109
self,
@@ -142,14 +148,17 @@ def _write_to_yaml(path: str, source: Any) -> None:
142148

143149
def _get_automations_from_api(self, workspace_id: str) -> Any:
144150
"""Returns automations for the workspace as JSON."""
145-
response: requests.Response = self._api.get_automations(workspace_id)
146-
if response.ok:
147-
return response.json()
148-
else:
149-
raise RuntimeError(
150-
f"Failed to get automations for {workspace_id}. "
151-
+ f"{response.status_code}: {response.text}"
151+
with self._api_rate_limiter:
152+
response: requests.Response = self._api.get_automations(
153+
workspace_id
152154
)
155+
if response.ok:
156+
return response.json()
157+
else:
158+
raise RuntimeError(
159+
f"Failed to get automations for {workspace_id}. "
160+
+ f"{response.status_code}: {response.text}"
161+
)
153162

154163
def _store_automations(self, export_path: Path, workspace_id: str) -> None:
155164
"""Stores the automations in the specified export path."""
@@ -181,7 +190,8 @@ def store_declarative_filter_views(
181190
) -> None:
182191
"""Stores the filter views in the specified export path."""
183192
# Get the filter views YAML files from the API
184-
self._api.store_declarative_filter_views(workspace_id, export_path)
193+
with self._api_rate_limiter:
194+
self._api.store_declarative_filter_views(workspace_id, export_path)
185195

186196
# Move filter views to the subfolder containing the analytics model
187197
self._move_folder(
@@ -229,7 +239,10 @@ def _get_workspace_export(
229239
# the SDK. That way we could save and package all the declarations
230240
# directly instead of reorganizing the folder structures. That should
231241
# be more transparent/readable and possibly safer for threading
232-
self._api.store_declarative_workspace(workspace_id, export_path)
242+
with self._api_rate_limiter:
243+
self._api.store_declarative_workspace(
244+
workspace_id, export_path
245+
)
233246
self.store_declarative_filter_views(export_path, workspace_id)
234247
self._store_automations(export_path, workspace_id)
235248

gooddata-pipelines/gooddata_pipelines/backup_and_restore/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class DirNames:
2525
class ApiDefaults:
2626
DEFAULT_PAGE_SIZE = 100
2727
DEFAULT_BATCH_SIZE = 100
28+
DEFAULT_API_CALLS_PER_SECOND = 1.0
2829

2930

3031
@dataclass(frozen=True)

gooddata-pipelines/gooddata_pipelines/backup_and_restore/models/storage.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,13 @@ class BackupRestoreConfig(BaseModel):
8383
description="Batch size must be greater than 0",
8484
),
8585
] = Field(default=BackupSettings.DEFAULT_BATCH_SIZE)
86+
api_calls_per_second: Annotated[
87+
float,
88+
Field(
89+
gt=0,
90+
description="Maximum API calls per second (rate limiting)",
91+
),
92+
] = Field(default=BackupSettings.DEFAULT_API_CALLS_PER_SECOND)
8693

8794
@classmethod
8895
def from_yaml(cls, conf_path: str) -> "BackupRestoreConfig":
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# (C) 2025 GoodData Corporation

"""Shared utility helpers for the gooddata-pipelines package."""

from .rate_limiter import RateLimiter

# Explicit public API of the utils subpackage.
__all__ = ["RateLimiter"]
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# (C) 2025 GoodData Corporation

import functools
import threading
import time
from typing import Any, Callable, Literal


class RateLimiter:
    """Throttle calls to at most ``calls_per_second`` per second.

    Three usage patterns are supported:
        * shared-instance decorator: ``limiter = RateLimiter(); @limiter``
        * per-function decorator:    ``@RateLimiter(calls_per_second=2)``
        * context manager:           ``with RateLimiter(2): ...``
    """

    def __init__(self, calls_per_second: float = 1.0) -> None:
        if calls_per_second <= 0:
            raise ValueError("calls_per_second must be greater than 0")

        # Public configuration.
        self.calls_per_second = calls_per_second
        self.min_interval = 1.0 / calls_per_second

        # Mutable state is guarded by the lock so concurrent callers
        # queue up one interval apart.
        self._lock = threading.Lock()
        self._last_call_time = 0.0

    def wait_if_needed(self) -> float:
        """Block until the rate limit allows another call.

        Returns the number of seconds slept (0.0 when no wait was needed).
        """
        with self._lock:
            now = time.monotonic()
            elapsed = now - self._last_call_time

            if elapsed >= self.min_interval:
                # Enough time has passed; record the call and proceed.
                self._last_call_time = now
                return 0.0

            # Too soon: sleep off the remainder of the interval. The lock
            # stays held during the sleep, serializing concurrent callers.
            pause = self.min_interval - elapsed
            time.sleep(pause)
            self._last_call_time = time.monotonic()
            return pause

    # Decorator support
    def __call__(self, func: Callable[..., Any]) -> Callable[..., Any]:
        """Wrap ``func`` so each invocation honors the rate limit."""

        @functools.wraps(func)
        def throttled(*args: Any, **kwargs: Any) -> Any:
            self.wait_if_needed()
            return func(*args, **kwargs)

        return throttled

    # Context manager support
    def __enter__(self) -> "RateLimiter":
        """Wait (if necessary) on entry, then hand back the limiter."""
        self.wait_if_needed()
        return self

    def __exit__(
        self, exc_type: Any, exc_val: Any, exc_tb: Any
    ) -> Literal[False]:
        # Returning False never suppresses exceptions from the block.
        return False

    def reset(self) -> None:
        """Forget the last call time (useful in tests)."""
        with self._lock:
            self._last_call_time = 0.0

gooddata-pipelines/tests/backup_and_restore/test_backup.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import os
44
import shutil
55
import tempfile
6-
import threading
76
from pathlib import Path
87
from unittest import mock
98

@@ -325,7 +324,6 @@ def test_process_batch_success(
325324

326325
backup_manager._process_batch(
327326
batch=batch,
328-
stop_event=threading.Event(),
329327
retry_count=0,
330328
)
331329

@@ -362,7 +360,6 @@ def fail_once(*args, **kwargs):
362360

363361
backup_manager._process_batch(
364362
batch=batch,
365-
stop_event=threading.Event(),
366363
)
367364

368365
assert get_workspace_export_mock.call_count == 2
@@ -392,7 +389,6 @@ def test_process_batch_raises_after_max_retries(
392389
with pytest.raises(Exception) as exc_info:
393390
backup_manager._process_batch(
394391
batch=batch,
395-
stop_event=threading.Event(),
396392
retry_count=BackupSettings.MAX_RETRIES,
397393
)
398394
assert str(exc_info.value) == "fail"
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
# (C) 2025 GoodData Corporation
2+
3+
import time
4+
import pytest
5+
from gooddata_pipelines.utils.rate_limiter import RateLimiter
6+
7+
8+
# ---------------------------
# Core wait + reset behavior
# ---------------------------


def test_rate_limiter_no_wait_needed():
    """A limiter with a very high rate should never sleep."""
    limiter = RateLimiter(calls_per_second=1000.0)  # Very fast limit
    waited = limiter.wait_if_needed()
    assert waited == pytest.approx(0.0, abs=0.001)


def test_rate_limiter_enforces_delay():
    """A second call inside the interval sleeps for the remainder (~0.5s)."""
    limiter = RateLimiter(calls_per_second=2.0)
    limiter.wait_if_needed()
    # Measure with a monotonic clock: time.time() is wall-clock time and can
    # jump (NTP adjustment), making timing assertions flaky. The limiter
    # itself is built on time.monotonic().
    start = time.monotonic()
    waited = limiter.wait_if_needed()
    duration = time.monotonic() - start

    assert waited >= 0.49
    assert duration < 0.65


def test_rate_limiter_respects_reset():
    """reset() clears the last-call timestamp, so the next call is free."""
    limiter = RateLimiter(calls_per_second=1.0)
    limiter.wait_if_needed()
    limiter.reset()
    waited = limiter.wait_if_needed()
    assert waited == pytest.approx(0.0, abs=0.001)


def test_rate_limiter_min_interval_property():
    """min_interval is the reciprocal of calls_per_second."""
    limiter = RateLimiter(calls_per_second=4.0)
    assert limiter.min_interval == pytest.approx(0.25, abs=1e-9)
41+
42+
43+
# -----------------------------------------
# Decorator: shared instance (@limiter)
# -----------------------------------------


def test_rate_limiter_as_decorator_enforces_delay_shared_instance():
    """Two calls through the same decorated function are ~0.5s apart."""
    limiter = RateLimiter(calls_per_second=2.0)
    ts = []

    @limiter
    def func():
        # time.monotonic() is immune to wall-clock adjustments, unlike
        # time.time(), so interval assertions cannot be skewed by NTP.
        ts.append(time.monotonic())

    func()
    func()

    assert len(ts) == 2
    assert ts[1] - ts[0] >= 0.49


def test_rate_limiter_decorator_shared_state_across_functions():
    """Functions decorated with the same instance share one rate budget."""
    limiter = RateLimiter(calls_per_second=2.0)
    ts = []

    @limiter
    def func_a():
        ts.append(time.monotonic())

    @limiter
    def func_b():
        ts.append(time.monotonic())

    func_a()
    func_b()  # should be throttled by the *same* limiter
    assert len(ts) == 2
    assert ts[1] - ts[0] >= 0.49


def test_multiple_limiters_independent_state_shared_instance_mode():
    """Distinct limiter instances do not throttle each other."""
    limiter_a = RateLimiter(calls_per_second=2.0)
    limiter_b = RateLimiter(calls_per_second=2.0)

    ts_a = []
    ts_b = []

    @limiter_a
    def func_a():
        ts_a.append(time.monotonic())

    @limiter_b
    def func_b():
        ts_b.append(time.monotonic())

    func_a()
    func_b()

    # They should be ~simultaneous since they use different instances
    assert abs(ts_a[0] - ts_b[0]) < 0.05
101+
102+
103+
# -------------------------------------------------------
# Decorator: per-function instance (@RateLimiter(...))
# -------------------------------------------------------


def test_per_function_decorator_enforces_delay_per_function():
    """Decorating with a fresh instance throttles calls to that function."""
    # Each function decorated this way gets its *own* limiter instance.
    ts = []

    @RateLimiter(calls_per_second=2.0)  # 0.5s
    def func():
        # Monotonic clock: immune to wall-clock (NTP) jumps that would
        # make interval assertions flaky.
        ts.append(time.monotonic())

    func()
    func()
    assert len(ts) == 2
    assert ts[1] - ts[0] >= 0.49


def test_per_function_decorator_independent_state_between_functions():
    """Separate per-function limiters impose no cross-function delay."""
    ts_a = []
    ts_b = []

    @RateLimiter(calls_per_second=2.0)
    def func_a():
        ts_a.append(time.monotonic())

    @RateLimiter(calls_per_second=2.0)
    def func_b():
        ts_b.append(time.monotonic())

    func_a()
    func_b()  # independent limiter, so no enforced delay between A and B
    assert abs(ts_a[0] - ts_b[0]) < 0.05
137+
138+
139+
# -----------------------------
# Context manager usage
# -----------------------------


def test_context_manager_waits_on_enter():
    """Entering the context a second time waits out the interval."""
    limiter = RateLimiter(calls_per_second=2.0)
    with limiter:
        # Monotonic timestamps: wall-clock time.time() can jump under NTP
        # adjustment and break the interval assertion below.
        t1 = time.monotonic()
    with limiter:
        t2 = time.monotonic()

    # The second 'with' should be at least 0.5s after the first 'with' enter
    assert t2 - t1 >= 0.49


def test_context_manager_multiple_uses_same_instance():
    """Repeated entries on one instance are spaced by ~min_interval."""
    limiter = RateLimiter(calls_per_second=3.0)
    times = []

    for _ in range(3):
        with limiter:
            times.append(time.monotonic())

    intervals = [b - a for a, b in zip(times, times[1:])]
    for iv in intervals:
        assert iv >= 0.30  # a bit of slack for timing variance


def test_context_manager_propagates_exceptions():
    """__exit__ returns False, so exceptions inside the block propagate."""
    limiter = RateLimiter(calls_per_second=10.0)

    class Boom(Exception):
        pass

    with pytest.raises(Boom):
        with limiter:
            raise Boom("fail")

0 commit comments

Comments
 (0)