From 845167af5d078618bda22af96c7160d5f7fec274 Mon Sep 17 00:00:00 2001 From: Jan Kadlec Date: Mon, 20 Jan 2025 08:48:03 +0100 Subject: [PATCH] perf: smart catalog attributes fetching Improves fetching for catalog attributes. The previous implementation used threshold to prevent `414 Request-URI Too Large`. If the threshold was reached the whole catalog was fetched. The new implementation calls get_attributes_catalog multiple times with different rsql_filter. JIRA: STL-1036 risk: low --- .../gooddata_pandas/data_access.py | 10 +- gooddata-pandas/gooddata_pandas/utils.py | 21 ++ gooddata-pandas/tests/utils/__init__.py | 1 + ...st_get_catalog_attributes_for_extract.yaml | 266 ++++++++++++++++++ gooddata-pandas/tests/utils/test_utils.py | 24 ++ gooddata-sdk/gooddata_sdk/utils.py | 26 +- 6 files changed, 335 insertions(+), 13 deletions(-) create mode 100644 gooddata-pandas/tests/utils/__init__.py create mode 100644 gooddata-pandas/tests/utils/fixtures/test_get_catalog_attributes_for_extract.yaml create mode 100644 gooddata-pandas/tests/utils/test_utils.py diff --git a/gooddata-pandas/gooddata_pandas/data_access.py b/gooddata-pandas/gooddata_pandas/data_access.py index 25a63dad1..39a3bae62 100644 --- a/gooddata-pandas/gooddata_pandas/data_access.py +++ b/gooddata-pandas/gooddata_pandas/data_access.py @@ -16,7 +16,7 @@ ObjId, TableDimension, ) -from gooddata_sdk.utils import IdObjType, filter_for_attributes_labels +from gooddata_sdk.utils import IdObjType from gooddata_pandas.utils import ( ColumnsDef, @@ -26,6 +26,7 @@ _to_attribute, _to_item, _typed_attribute_value, + get_catalog_attributes_for_extract, ) @@ -446,12 +447,7 @@ def compute_and_extract( if not exec_def.has_attributes(): return _extract_for_metrics_only(response, cols, col_to_metric_idx), dict() else: - filter_query = filter_for_attributes_labels(exec_def.attributes) - # if there is to many labels then all attributes are fetched and no rsql filter is used - # it prevention again 414 Request-URI Too Long - 
def get_catalog_attributes_for_extract(
    sdk: GoodDataSdk, workspace_id: str, attributes: list[Attribute], character_limit: int = 1500
) -> list[CatalogAttribute]:
    """
    Fetch only the catalog attributes needed for the given execution attributes.

    The label ids of ``attributes`` are turned into one or more RSQL filters by
    ``filter_for_attributes_labels`` (each at most ``character_limit`` characters long)
    and ``get_attributes_catalog`` is called once per filter. This avoids loading the
    whole workspace catalog.

    :param sdk: SDK instance whose ``catalog_workspace_content`` service is queried
    :param workspace_id: identifier of the workspace to read the catalog from
    :param attributes: attributes whose labels should be looked up
    :param character_limit: maximum length of a single RSQL filter; forwarded to
        ``filter_for_attributes_labels``
    :return: catalog attributes collected from all queries, each attribute id at most once,
        in the order they were first returned
    """
    rsql_queries = filter_for_attributes_labels(attributes, character_limit)

    # The requested labels may be split across several queries. An attribute that owns
    # labels from more than one batch may then be returned by more than one query, so
    # results are keyed by attribute id to keep only the first occurrence.
    unique_attributes: dict[str, CatalogAttribute] = {}
    for query in rsql_queries:
        fetched = sdk.catalog_workspace_content.get_attributes_catalog(
            workspace_id, include=["labels", "datasets"], rsql_filter=query
        )
        for catalog_attribute in fetched:
            unique_attributes.setdefault(catalog_attribute.id, catalog_attribute)
    return list(unique_attributes.values())
diff --git a/gooddata-pandas/tests/utils/__init__.py b/gooddata-pandas/tests/utils/__init__.py new file mode 100644 index 000000000..37d863d60 --- /dev/null +++ b/gooddata-pandas/tests/utils/__init__.py @@ -0,0 +1 @@ +# (C) 2025 GoodData Corporation diff --git a/gooddata-pandas/tests/utils/fixtures/test_get_catalog_attributes_for_extract.yaml b/gooddata-pandas/tests/utils/fixtures/test_get_catalog_attributes_for_extract.yaml new file mode 100644 index 000000000..1a9a924a2 --- /dev/null +++ b/gooddata-pandas/tests/utils/fixtures/test_get_catalog_attributes_for_extract.yaml @@ -0,0 +1,266 @@ +# (C) 2025 GoodData Corporation +version: 1 +interactions: + - request: + method: GET + uri: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3Din%3D%28campaign_name%29&page=0&size=500 + body: null + headers: + Accept: + - application/vnd.gooddata.api+json + Accept-Encoding: + - br, gzip, deflate + X-GDC-VALIDATE-RELATIONS: + - 'true' + X-Requested-With: + - XMLHttpRequest + response: + status: + code: 200 + message: OK + headers: + Access-Control-Allow-Credentials: + - 'true' + Access-Control-Expose-Headers: + - Content-Disposition, Content-Length, Content-Range, Set-Cookie + Cache-Control: + - no-cache, no-store, max-age=0, must-revalidate + Connection: + - keep-alive + Content-Length: + - '1541' + Content-Security-Policy: + - 'default-src ''self'' *.wistia.com *.wistia.net; script-src ''self'' ''unsafe-inline'' + ''unsafe-eval'' *.wistia.com *.wistia.net *.hsforms.net *.hsforms.com + src.litix.io matomo.anywhere.gooddata.com *.jquery.com unpkg.com cdnjs.cloudflare.com; + img-src * data: blob:; style-src ''self'' ''unsafe-inline'' fonts.googleapis.com + cdn.jsdelivr.net fast.fonts.net; font-src ''self'' data: fonts.gstatic.com + *.alicdn.com *.wistia.com cdn.jsdelivr.net info.gooddata.com; frame-src + ''self'' *.hsforms.net *.hsforms.com; object-src ''none''; worker-src + ''self'' blob:; child-src blob:; connect-src 
''self'' *.tiles.mapbox.com + *.mapbox.com *.litix.io *.wistia.com *.hsforms.net *.hsforms.com embedwistia-a.akamaihd.net + matomo.anywhere.gooddata.com; media-src ''self'' blob: data: *.wistia.com + *.wistia.net embedwistia-a.akamaihd.net' + Content-Type: + - application/vnd.gooddata.api+json + DATE: &id001 + - PLACEHOLDER + Expires: + - '0' + GoodData-Deployment: + - aio + Permission-Policy: + - geolocation 'none'; midi 'none'; sync-xhr 'none'; microphone 'none'; camera + 'none'; magnetometer 'none'; gyroscope 'none'; fullscreen 'none'; payment + 'none'; + Pragma: + - no-cache + Referrer-Policy: + - no-referrer + Server: + - nginx + Vary: + - Origin + - Access-Control-Request-Method + - Access-Control-Request-Headers + X-Content-Type-Options: + - nosniff + X-GDC-TRACE-ID: *id001 + X-XSS-Protection: + - '0' + set-cookie: + - SPRING_REDIRECT_URI=; Max-Age=0; Expires=Mon, 20 Jan 2025 10:18:23 GMT; + Path=/; HTTPOnly; SameSite=Lax + body: + string: + data: + - id: campaign_name + type: attribute + attributes: + title: Campaign name + description: Campaign name + tags: + - Campaigns + areRelationsValid: true + sourceColumn: campaign_name + sourceColumnDataType: STRING + relationships: + dataset: + data: + id: campaigns + type: dataset + labels: + data: + - id: campaign_name + type: label + links: + self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes/campaign_name + meta: + origin: + originType: NATIVE + originId: demo + included: + - id: campaigns + type: dataset + attributes: + title: Campaigns + description: Campaigns + tags: + - Campaigns + grain: + - id: campaign_id + type: attribute + dataSourceTableId: demo-test-ds:campaigns + dataSourceTablePath: + - demo + - campaigns + type: NORMAL + links: + self: http://localhost:3000/api/v1/entities/workspaces/demo/datasets/campaigns + - id: campaign_name + type: label + attributes: + title: Campaign name + description: Campaign name + tags: + - Campaigns + primary: true + sourceColumn: campaign_name + 
sourceColumnDataType: STRING + valueType: TEXT + links: + self: http://localhost:3000/api/v1/entities/workspaces/demo/labels/campaign_name + links: + self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27campaign_name%27&page=0&size=500 + next: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27campaign_name%27&page=1&size=500 + - request: + method: GET + uri: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3Din%3D%28region%29&page=0&size=500 + body: null + headers: + Accept: + - application/vnd.gooddata.api+json + Accept-Encoding: + - br, gzip, deflate + X-GDC-VALIDATE-RELATIONS: + - 'true' + X-Requested-With: + - XMLHttpRequest + response: + status: + code: 200 + message: OK + headers: + Access-Control-Allow-Credentials: + - 'true' + Access-Control-Expose-Headers: + - Content-Disposition, Content-Length, Content-Range, Set-Cookie + Cache-Control: + - no-cache, no-store, max-age=0, must-revalidate + Connection: + - keep-alive + Content-Length: + - '1450' + Content-Security-Policy: + - 'default-src ''self'' *.wistia.com *.wistia.net; script-src ''self'' ''unsafe-inline'' + ''unsafe-eval'' *.wistia.com *.wistia.net *.hsforms.net *.hsforms.com + src.litix.io matomo.anywhere.gooddata.com *.jquery.com unpkg.com cdnjs.cloudflare.com; + img-src * data: blob:; style-src ''self'' ''unsafe-inline'' fonts.googleapis.com + cdn.jsdelivr.net fast.fonts.net; font-src ''self'' data: fonts.gstatic.com + *.alicdn.com *.wistia.com cdn.jsdelivr.net info.gooddata.com; frame-src + ''self'' *.hsforms.net *.hsforms.com; object-src ''none''; worker-src + ''self'' blob:; child-src blob:; connect-src ''self'' *.tiles.mapbox.com + *.mapbox.com *.litix.io *.wistia.com *.hsforms.net *.hsforms.com embedwistia-a.akamaihd.net + matomo.anywhere.gooddata.com; media-src ''self'' blob: data: *.wistia.com + 
*.wistia.net embedwistia-a.akamaihd.net' + Content-Type: + - application/vnd.gooddata.api+json + DATE: *id001 + Expires: + - '0' + GoodData-Deployment: + - aio + Permission-Policy: + - geolocation 'none'; midi 'none'; sync-xhr 'none'; microphone 'none'; camera + 'none'; magnetometer 'none'; gyroscope 'none'; fullscreen 'none'; payment + 'none'; + Pragma: + - no-cache + Referrer-Policy: + - no-referrer + Server: + - nginx + Vary: + - Origin + - Access-Control-Request-Method + - Access-Control-Request-Headers + X-Content-Type-Options: + - nosniff + X-GDC-TRACE-ID: *id001 + X-XSS-Protection: + - '0' + set-cookie: + - SPRING_REDIRECT_URI=; Max-Age=0; Expires=Mon, 20 Jan 2025 10:18:23 GMT; + Path=/; HTTPOnly; SameSite=Lax + body: + string: + data: + - id: region + type: attribute + attributes: + title: Region + description: Region + tags: + - Customers + areRelationsValid: true + sourceColumn: region + sourceColumnDataType: STRING + relationships: + dataset: + data: + id: customers + type: dataset + labels: + data: + - id: region + type: label + links: + self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes/region + meta: + origin: + originType: NATIVE + originId: demo + included: + - id: customers + type: dataset + attributes: + title: Customers + description: Customers + tags: + - Customers + grain: + - id: customer_id + type: attribute + dataSourceTableId: demo-test-ds:customers + dataSourceTablePath: + - demo + - customers + type: NORMAL + links: + self: http://localhost:3000/api/v1/entities/workspaces/demo/datasets/customers + - id: region + type: label + attributes: + title: Region + description: Region + tags: + - Customers + primary: true + sourceColumn: region + sourceColumnDataType: STRING + valueType: TEXT + links: + self: http://localhost:3000/api/v1/entities/workspaces/demo/labels/region + links: + self: 
gd_vcr = get_vcr()

# Cassettes for this test module live in the "fixtures" directory next to this file.
_current_dir = Path(__file__).parent.absolute()
_fixtures_dir = _current_dir / "fixtures"


@gd_vcr.use_cassette(str(_fixtures_dir / "test_get_catalog_attributes_for_extract.yaml"))
def test_get_catalog_attributes_for_extract(test_config):
    """Catalog attributes are fetched correctly when the labels are split into separate queries."""
    label_ids = ("campaign_name", "region")
    requested = [Attribute(local_id=str(position), label=label_id) for position, label_id in enumerate(label_ids)]
    sdk = GoodDataSdk.create(host_=test_config["host"], token_=test_config["token"])

    # 28 == len("labels.id=in=(campaign_name)"), so both labels cannot share one filter
    # and each label is requested by its own query.
    fetched = get_catalog_attributes_for_extract(sdk, "demo", requested, character_limit=28)

    assert len(fetched) == 2
    assert [item.id for item in fetched] == ["campaign_name", "region"]
def filter_for_attributes_labels(attributes: list[Attribute], character_limit: int = 1500) -> list[str]:
    """
    Build RSQL filters of the form ``labels.id=in=(id1,id2,...)`` for the labels of the given attributes.

    Label ids are de-duplicated (first occurrence wins) and packed, in order, into as few
    filters as possible such that no filter is longer than ``character_limit`` characters.
    Character limit is to prevent 414 Request-URI Too Large error from server.

    :param attributes: attributes whose ``label.id`` values should be filtered on
    :param character_limit: maximum length of one RSQL filter string
    :return: list of RSQL filter strings; empty when ``attributes`` is empty
    :raises ValueError: when ``character_limit`` is too small to hold the longest label id
    """
    prefix = "labels.id=in=("
    suffix = ")"
    overhead = len(prefix) + len(suffix)

    # set(...) does not work deterministically; therefore, it is necessary to use dict.fromkeys
    label_ids = list(dict.fromkeys(attribute.label.id for attribute in attributes))
    if not label_ids:
        # Nothing to filter on; also avoids calling max() on an empty sequence.
        return []

    # Explicit exception instead of assert: asserts are removed when Python runs with -O,
    # which would let an over-long label id through unnoticed.
    min_limit = overhead + max(map(len, label_ids))
    if character_limit < min_limit:
        raise ValueError(f"Character limit must be at least {min_limit}")

    batches: list[list[str]] = []
    current_batch: list[str] = []
    current_length = overhead  # length of the filter that current_batch would render to

    for label_id in label_ids:
        # Every id after the first one in a batch also costs one separating comma.
        added_length = len(label_id) + (1 if current_batch else 0)
        if current_length + added_length <= character_limit:
            current_batch.append(label_id)
            current_length += added_length
        else:
            batches.append(current_batch)
            current_batch = [label_id]
            current_length = overhead + len(label_id)

    # label_ids is non-empty here, so the last batch always holds at least one id.
    batches.append(current_batch)
    return [f"{prefix}{','.join(batch)}{suffix}" for batch in batches]