From 845167af5d078618bda22af96c7160d5f7fec274 Mon Sep 17 00:00:00 2001
From: Jan Kadlec <kadlino98@seznam.cz>
Date: Mon, 20 Jan 2025 08:48:03 +0100
Subject: [PATCH] perf: smart catalog attributes fetching

Improves fetching of catalog attributes. The previous implementation used a character threshold on the RSQL filter to prevent `414 Request-URI Too Large`. If the threshold was reached, the whole catalog was fetched. The new implementation splits the label ids into several filters and calls get_attributes_catalog once per filter, each time with a different rsql_filter.

JIRA: STL-1036
risk: low
---
 .../gooddata_pandas/data_access.py            |  10 +-
 gooddata-pandas/gooddata_pandas/utils.py      |  21 ++
 gooddata-pandas/tests/utils/__init__.py       |   1 +
 ...st_get_catalog_attributes_for_extract.yaml | 266 ++++++++++++++++++
 gooddata-pandas/tests/utils/test_utils.py     |  24 ++
 gooddata-sdk/gooddata_sdk/utils.py            |  26 +-
 6 files changed, 335 insertions(+), 13 deletions(-)
 create mode 100644 gooddata-pandas/tests/utils/__init__.py
 create mode 100644 gooddata-pandas/tests/utils/fixtures/test_get_catalog_attributes_for_extract.yaml
 create mode 100644 gooddata-pandas/tests/utils/test_utils.py

diff --git a/gooddata-pandas/gooddata_pandas/data_access.py b/gooddata-pandas/gooddata_pandas/data_access.py
index 25a63dad1..39a3bae62 100644
--- a/gooddata-pandas/gooddata_pandas/data_access.py
+++ b/gooddata-pandas/gooddata_pandas/data_access.py
@@ -16,7 +16,7 @@
     ObjId,
     TableDimension,
 )
-from gooddata_sdk.utils import IdObjType, filter_for_attributes_labels
+from gooddata_sdk.utils import IdObjType
 
 from gooddata_pandas.utils import (
     ColumnsDef,
@@ -26,6 +26,7 @@
     _to_attribute,
     _to_item,
     _typed_attribute_value,
+    get_catalog_attributes_for_extract,
 )
 
 
@@ -446,12 +447,7 @@ def compute_and_extract(
     if not exec_def.has_attributes():
         return _extract_for_metrics_only(response, cols, col_to_metric_idx), dict()
     else:
-        filter_query = filter_for_attributes_labels(exec_def.attributes)
-        # if there is to many labels then all attributes are fetched and no rsql filter is used
-        # it prevention again 414 Request-URI Too Long
-        attributes = sdk.catalog_workspace_content.get_attributes_catalog(
-            workspace_id, include=["labels", "datasets"], rsql_filter=filter_query
-        )
+        attributes = get_catalog_attributes_for_extract(sdk, workspace_id, exec_def.attributes)
         return _extract_from_attributes_and_maybe_metrics(
             response,
             attributes,
diff --git a/gooddata-pandas/gooddata_pandas/utils.py b/gooddata-pandas/gooddata_pandas/utils.py
index 167678c9f..fb8fd8b16 100644
--- a/gooddata-pandas/gooddata_pandas/utils.py
+++ b/gooddata-pandas/gooddata_pandas/utils.py
@@ -9,6 +9,7 @@
 from gooddata_sdk import (
     Attribute,
     CatalogAttribute,
+    GoodDataSdk,
     Metric,
     ObjId,
     SimpleMetric,
@@ -16,6 +17,7 @@
     VisualizationMetric,
 )
 from gooddata_sdk.type_converter import AttributeConverterStore, DateConverter, DatetimeConverter, IntegerConverter
+from gooddata_sdk.utils import filter_for_attributes_labels
 from pandas import Index, MultiIndex
 
 LabelItemDef = Union[Attribute, ObjId, str]
@@ -29,6 +31,25 @@
 DatetimeConverter.set_external_fnc(lambda self, value: pandas.to_datetime(value))
 
 
+def get_catalog_attributes_for_extract(
+    sdk: GoodDataSdk, workspace_id: str, attributes: list[Attribute], character_limit: int = 1500
+) -> list[CatalogAttribute]:
+    """
+    Fetch the catalog attributes (with labels and datasets) matching the labels of the given attributes.
+    filter_for_attributes_labels turns the label ids into RSQL filters bounded by character_limit and
+    the workspace catalog is queried once per filter, so the whole catalog never has to be loaded.
+    """
+    fetched: list[CatalogAttribute] = []
+    for rsql_filter in filter_for_attributes_labels(attributes, character_limit):
+        # One request per filter; results are concatenated in filter order.
+        fetched.extend(
+            sdk.catalog_workspace_content.get_attributes_catalog(
+                workspace_id, include=["labels", "datasets"], rsql_filter=rsql_filter
+            )
+        )
+    return fetched
+
+
 def _unique_local_id() -> str:
     """
     Generate unique local ID of a DataItem without dashes.
diff --git a/gooddata-pandas/tests/utils/__init__.py b/gooddata-pandas/tests/utils/__init__.py
new file mode 100644
index 000000000..37d863d60
--- /dev/null
+++ b/gooddata-pandas/tests/utils/__init__.py
@@ -0,0 +1 @@
+# (C) 2025 GoodData Corporation
diff --git a/gooddata-pandas/tests/utils/fixtures/test_get_catalog_attributes_for_extract.yaml b/gooddata-pandas/tests/utils/fixtures/test_get_catalog_attributes_for_extract.yaml
new file mode 100644
index 000000000..1a9a924a2
--- /dev/null
+++ b/gooddata-pandas/tests/utils/fixtures/test_get_catalog_attributes_for_extract.yaml
@@ -0,0 +1,266 @@
+# (C) 2025 GoodData Corporation
+version: 1
+interactions:
+  - request:
+      method: GET
+      uri: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3Din%3D%28campaign_name%29&page=0&size=500
+      body: null
+      headers:
+        Accept:
+          - application/vnd.gooddata.api+json
+        Accept-Encoding:
+          - br, gzip, deflate
+        X-GDC-VALIDATE-RELATIONS:
+          - 'true'
+        X-Requested-With:
+          - XMLHttpRequest
+    response:
+      status:
+        code: 200
+        message: OK
+      headers:
+        Access-Control-Allow-Credentials:
+          - 'true'
+        Access-Control-Expose-Headers:
+          - Content-Disposition, Content-Length, Content-Range, Set-Cookie
+        Cache-Control:
+          - no-cache, no-store, max-age=0, must-revalidate
+        Connection:
+          - keep-alive
+        Content-Length:
+          - '1541'
+        Content-Security-Policy:
+          - 'default-src ''self'' *.wistia.com *.wistia.net; script-src ''self'' ''unsafe-inline''
+            ''unsafe-eval'' *.wistia.com *.wistia.net *.hsforms.net *.hsforms.com
+            src.litix.io matomo.anywhere.gooddata.com *.jquery.com unpkg.com cdnjs.cloudflare.com;
+            img-src * data: blob:; style-src ''self'' ''unsafe-inline'' fonts.googleapis.com
+            cdn.jsdelivr.net fast.fonts.net; font-src ''self'' data: fonts.gstatic.com
+            *.alicdn.com *.wistia.com cdn.jsdelivr.net info.gooddata.com; frame-src
+            ''self'' *.hsforms.net *.hsforms.com; object-src ''none''; worker-src
+            ''self'' blob:; child-src blob:; connect-src ''self'' *.tiles.mapbox.com
+            *.mapbox.com *.litix.io *.wistia.com *.hsforms.net *.hsforms.com embedwistia-a.akamaihd.net
+            matomo.anywhere.gooddata.com; media-src ''self'' blob: data: *.wistia.com
+            *.wistia.net embedwistia-a.akamaihd.net'
+        Content-Type:
+          - application/vnd.gooddata.api+json
+        DATE: &id001
+          - PLACEHOLDER
+        Expires:
+          - '0'
+        GoodData-Deployment:
+          - aio
+        Permission-Policy:
+          - geolocation 'none'; midi 'none'; sync-xhr 'none'; microphone 'none'; camera
+            'none'; magnetometer 'none'; gyroscope 'none'; fullscreen 'none'; payment
+            'none';
+        Pragma:
+          - no-cache
+        Referrer-Policy:
+          - no-referrer
+        Server:
+          - nginx
+        Vary:
+          - Origin
+          - Access-Control-Request-Method
+          - Access-Control-Request-Headers
+        X-Content-Type-Options:
+          - nosniff
+        X-GDC-TRACE-ID: *id001
+        X-XSS-Protection:
+          - '0'
+        set-cookie:
+          - SPRING_REDIRECT_URI=; Max-Age=0; Expires=Mon, 20 Jan 2025 10:18:23 GMT;
+            Path=/; HTTPOnly; SameSite=Lax
+      body:
+        string:
+          data:
+            - id: campaign_name
+              type: attribute
+              attributes:
+                title: Campaign name
+                description: Campaign name
+                tags:
+                  - Campaigns
+                areRelationsValid: true
+                sourceColumn: campaign_name
+                sourceColumnDataType: STRING
+              relationships:
+                dataset:
+                  data:
+                    id: campaigns
+                    type: dataset
+                labels:
+                  data:
+                    - id: campaign_name
+                      type: label
+              links:
+                self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes/campaign_name
+              meta:
+                origin:
+                  originType: NATIVE
+                  originId: demo
+          included:
+            - id: campaigns
+              type: dataset
+              attributes:
+                title: Campaigns
+                description: Campaigns
+                tags:
+                  - Campaigns
+                grain:
+                  - id: campaign_id
+                    type: attribute
+                dataSourceTableId: demo-test-ds:campaigns
+                dataSourceTablePath:
+                  - demo
+                  - campaigns
+                type: NORMAL
+              links:
+                self: http://localhost:3000/api/v1/entities/workspaces/demo/datasets/campaigns
+            - id: campaign_name
+              type: label
+              attributes:
+                title: Campaign name
+                description: Campaign name
+                tags:
+                  - Campaigns
+                primary: true
+                sourceColumn: campaign_name
+                sourceColumnDataType: STRING
+                valueType: TEXT
+              links:
+                self: http://localhost:3000/api/v1/entities/workspaces/demo/labels/campaign_name
+          links:
+            self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27campaign_name%27&page=0&size=500
+            next: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27campaign_name%27&page=1&size=500
+  - request:
+      method: GET
+      uri: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3Din%3D%28region%29&page=0&size=500
+      body: null
+      headers:
+        Accept:
+          - application/vnd.gooddata.api+json
+        Accept-Encoding:
+          - br, gzip, deflate
+        X-GDC-VALIDATE-RELATIONS:
+          - 'true'
+        X-Requested-With:
+          - XMLHttpRequest
+    response:
+      status:
+        code: 200
+        message: OK
+      headers:
+        Access-Control-Allow-Credentials:
+          - 'true'
+        Access-Control-Expose-Headers:
+          - Content-Disposition, Content-Length, Content-Range, Set-Cookie
+        Cache-Control:
+          - no-cache, no-store, max-age=0, must-revalidate
+        Connection:
+          - keep-alive
+        Content-Length:
+          - '1450'
+        Content-Security-Policy:
+          - 'default-src ''self'' *.wistia.com *.wistia.net; script-src ''self'' ''unsafe-inline''
+            ''unsafe-eval'' *.wistia.com *.wistia.net *.hsforms.net *.hsforms.com
+            src.litix.io matomo.anywhere.gooddata.com *.jquery.com unpkg.com cdnjs.cloudflare.com;
+            img-src * data: blob:; style-src ''self'' ''unsafe-inline'' fonts.googleapis.com
+            cdn.jsdelivr.net fast.fonts.net; font-src ''self'' data: fonts.gstatic.com
+            *.alicdn.com *.wistia.com cdn.jsdelivr.net info.gooddata.com; frame-src
+            ''self'' *.hsforms.net *.hsforms.com; object-src ''none''; worker-src
+            ''self'' blob:; child-src blob:; connect-src ''self'' *.tiles.mapbox.com
+            *.mapbox.com *.litix.io *.wistia.com *.hsforms.net *.hsforms.com embedwistia-a.akamaihd.net
+            matomo.anywhere.gooddata.com; media-src ''self'' blob: data: *.wistia.com
+            *.wistia.net embedwistia-a.akamaihd.net'
+        Content-Type:
+          - application/vnd.gooddata.api+json
+        DATE: *id001
+        Expires:
+          - '0'
+        GoodData-Deployment:
+          - aio
+        Permission-Policy:
+          - geolocation 'none'; midi 'none'; sync-xhr 'none'; microphone 'none'; camera
+            'none'; magnetometer 'none'; gyroscope 'none'; fullscreen 'none'; payment
+            'none';
+        Pragma:
+          - no-cache
+        Referrer-Policy:
+          - no-referrer
+        Server:
+          - nginx
+        Vary:
+          - Origin
+          - Access-Control-Request-Method
+          - Access-Control-Request-Headers
+        X-Content-Type-Options:
+          - nosniff
+        X-GDC-TRACE-ID: *id001
+        X-XSS-Protection:
+          - '0'
+        set-cookie:
+          - SPRING_REDIRECT_URI=; Max-Age=0; Expires=Mon, 20 Jan 2025 10:18:23 GMT;
+            Path=/; HTTPOnly; SameSite=Lax
+      body:
+        string:
+          data:
+            - id: region
+              type: attribute
+              attributes:
+                title: Region
+                description: Region
+                tags:
+                  - Customers
+                areRelationsValid: true
+                sourceColumn: region
+                sourceColumnDataType: STRING
+              relationships:
+                dataset:
+                  data:
+                    id: customers
+                    type: dataset
+                labels:
+                  data:
+                    - id: region
+                      type: label
+              links:
+                self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes/region
+              meta:
+                origin:
+                  originType: NATIVE
+                  originId: demo
+          included:
+            - id: customers
+              type: dataset
+              attributes:
+                title: Customers
+                description: Customers
+                tags:
+                  - Customers
+                grain:
+                  - id: customer_id
+                    type: attribute
+                dataSourceTableId: demo-test-ds:customers
+                dataSourceTablePath:
+                  - demo
+                  - customers
+                type: NORMAL
+              links:
+                self: http://localhost:3000/api/v1/entities/workspaces/demo/datasets/customers
+            - id: region
+              type: label
+              attributes:
+                title: Region
+                description: Region
+                tags:
+                  - Customers
+                primary: true
+                sourceColumn: region
+                sourceColumnDataType: STRING
+                valueType: TEXT
+              links:
+                self: http://localhost:3000/api/v1/entities/workspaces/demo/labels/region
+          links:
+            self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27region%27&page=0&size=500
+            next: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27region%27&page=1&size=500
diff --git a/gooddata-pandas/tests/utils/test_utils.py b/gooddata-pandas/tests/utils/test_utils.py
new file mode 100644
index 000000000..1d811f6f5
--- /dev/null
+++ b/gooddata-pandas/tests/utils/test_utils.py
@@ -0,0 +1,24 @@
+# (C) 2025 GoodData Corporation
+from pathlib import Path
+
+from gooddata_pandas.utils import get_catalog_attributes_for_extract
+from gooddata_sdk import (
+    Attribute,
+    GoodDataSdk,
+)
+from tests_support.vcrpy_utils import get_vcr
+
+gd_vcr = get_vcr()
+# Recorded cassettes for the tests in this module live in "fixtures" next to this file.
+_current_dir = Path(__file__).parent.absolute()
+_fixtures_dir = _current_dir / "fixtures"
+
+
+@gd_vcr.use_cassette(str(_fixtures_dir / "test_get_catalog_attributes_for_extract.yaml"))
+def test_get_catalog_attributes_for_extract(test_config):
+    # character_limit=28 fits exactly one label id per RSQL filter, so each label is fetched separately.
+    sdk = GoodDataSdk.create(host_=test_config["host"], token_=test_config["token"])
+    requested = [Attribute(local_id="0", label="campaign_name"), Attribute(local_id="1", label="region")]
+    fetched = get_catalog_attributes_for_extract(sdk, "demo", requested, character_limit=28)
+    assert len(fetched) == 2
+    assert [attribute.id for attribute in fetched] == ["campaign_name", "region"]
diff --git a/gooddata-sdk/gooddata_sdk/utils.py b/gooddata-sdk/gooddata_sdk/utils.py
index 60acdb9ac..17bf59045 100644
--- a/gooddata-sdk/gooddata_sdk/utils.py
+++ b/gooddata-sdk/gooddata_sdk/utils.py
@@ -9,7 +9,7 @@
 from enum import Enum, auto
 from pathlib import Path
 from shutil import rmtree
-from typing import Any, Callable, NamedTuple, Optional, Union, cast, no_type_check
+from typing import Any, Callable, NamedTuple, Union, cast, no_type_check
 from warnings import warn
 from xml.etree import ElementTree as ET
 
@@ -427,13 +427,27 @@ def ref_extract(ref: dict[str, Any]) -> Union[str, ObjId]:
     raise ValueError("invalid ref. must be identifier or localIdentifier")
 
 
-def filter_for_attributes_labels(attributes: list[Attribute], character_limit: int = 1500) -> Optional[str]:
+def filter_for_attributes_labels(attributes: list[Attribute], character_limit: int = 1500) -> list[str]:
     """
     Character limit is to prevent 414 Request-URI Too Large error from server.
     """
     # set(...) does not work deterministically; therefore, it is necessary to use dict.fromkeys
     label_ids = dict.fromkeys([attribute.label.id for attribute in attributes])
-    rsql_query = f"labels.id=in=({','.join(label_ids)})"
-    if len(rsql_query) < character_limit:
-        return rsql_query
-    return None
+
+    if not label_ids:
+        return []  # nothing to fetch; max() below would raise on an empty sequence
+    min_limit = len("labels.id=in=()") + max(map(len, label_ids))
+    if character_limit < min_limit:  # explicit raise: an assert is stripped under `python -O`
+        raise ValueError(f"Character limit must be at least {min_limit}")
+    queries: list[str] = []
+    current_batch: list[str] = []
+
+    for label_id in label_ids:
+        # greedily extend the batch while the whole query still fits into the limit
+        if len(f"labels.id=in=({','.join(current_batch + [label_id])})") <= character_limit:
+            current_batch.append(label_id)
+        else:
+            queries.append(f"labels.id=in=({','.join(current_batch)})")
+            current_batch = [label_id]
+    queries.append(f"labels.id=in=({','.join(current_batch)})")  # last batch is never empty here
+    return queries