diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e61ed006a..6a3e6f193 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -66,7 +66,7 @@ One logical change is done in one commit.
 
 ### Documenting new features
 
-To document a new feature, you need to create a new `.md` file in one of the subsections. These subsections reresent the left navigation menu and are in a hierarchical directories.
+To document a new feature, you need to create a new `.md` file in one of the subsections. These subsections represent the left navigation menu and are organized in hierarchical directories.
 
 e.g.:
 
diff --git a/gooddata-pandas/gooddata_pandas/dataframe.py b/gooddata-pandas/gooddata_pandas/dataframe.py
index aa1342736..e5d3b943f 100644
--- a/gooddata-pandas/gooddata_pandas/dataframe.py
+++ b/gooddata-pandas/gooddata_pandas/dataframe.py
@@ -238,6 +238,7 @@ def for_created_visualization(
         created_visualizations_response: dict,
         on_execution_submitted: Optional[Callable[[Execution], None]] = None,
         is_cancellable: bool = False,
+        optimized: bool = False,
     ) -> tuple[pandas.DataFrame, DataFrameMetadata]:
         """
         Creates a data frame using a created visualization.
@@ -247,6 +248,10 @@ def for_created_visualization(
             on_execution_submitted (Optional[Callable[[Execution], None]]): Callback to call when the execution
                 was submitted to the backend.
             is_cancellable (bool, optional): Whether the execution should be cancelled when the connection is interrupted.
+            optimized (bool, default=False): Use the memory-optimized accumulator if True; by default, the accumulator
+                stores headers in memory as lists of dicts, which can consume a lot of memory for large results.
+                The optimized accumulator stores only unique values and keeps only references to them in the list,
+                which can significantly reduce memory usage.
 
         Returns:
             pandas.DataFrame: A DataFrame instance.
@@ -257,6 +262,7 @@ def for_created_visualization(
         return self.for_exec_def(
             exec_def=execution_definition,
             on_execution_submitted=on_execution_submitted,
+            optimized=optimized,
         )
 
     def result_cache_metadata_for_exec_result_id(self, result_id: str) -> ResultCacheMetadata:
@@ -279,6 +285,7 @@ def for_exec_def(
         result_size_bytes_limit: Optional[int] = None,
         page_size: int = _DEFAULT_PAGE_SIZE,
         on_execution_submitted: Optional[Callable[[Execution], None]] = None,
+        optimized: bool = False,
     ) -> tuple[pandas.DataFrame, DataFrameMetadata]:
         """
         Creates a data frame using an execution definition.
@@ -311,6 +318,10 @@ def for_exec_def(
             page_size (int): Number of records per page.
             on_execution_submitted (Optional[Callable[[Execution], None]]): Callback to call when the execution
                 was submitted to the backend.
+            optimized (bool, default=False): Use the memory-optimized accumulator if True; by default, the accumulator
+                stores headers in memory as lists of dicts, which can consume a lot of memory for large results.
+                The optimized accumulator stores only unique values and keeps only references to them in the list,
+                which can significantly reduce memory usage.
 
         Returns:
             Tuple[pandas.DataFrame, DataFrameMetadata]: Tuple holding DataFrame and DataFrame metadata.
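For context, a minimal sketch of how the new flag is meant to be used from the public API. The host, token, workspace, and result identifiers below are placeholders, not values from this change:

```python
from gooddata_pandas import GoodPandas

# Placeholders: substitute a real host, token, workspace id and execution result id.
gp = GoodPandas(host="https://example.gooddata.com", token="<api-token>")
frames = gp.data_frames("<workspace-id>")

# optimized=True switches the paging accumulator to the interning variant,
# so repeated header values are stored once and shared by reference.
df, metadata = frames.for_exec_result_id(result_id="<execution-result-id>", optimized=True)
```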
@@ -331,6 +342,7 @@ def for_exec_def(
             result_size_dimensions_limits=result_size_dimensions_limits,
             result_size_bytes_limit=result_size_bytes_limit,
             page_size=page_size,
+            optimized=optimized,
         )
 
     def for_exec_result_id(
@@ -343,6 +355,7 @@ def for_exec_result_id(
         use_local_ids_in_headers: bool = False,
         use_primary_labels_in_attributes: bool = False,
         page_size: int = _DEFAULT_PAGE_SIZE,
+        optimized: bool = False,
     ) -> tuple[pandas.DataFrame, DataFrameMetadata]:
         """
         Retrieves a DataFrame and DataFrame metadata for a given execution result identifier.
@@ -373,6 +386,10 @@ def for_exec_result_id(
             use_local_ids_in_headers (bool): Use local identifier in headers.
             use_primary_labels_in_attributes (bool): Use primary labels in attributes.
             page_size (int): Number of records per page.
+            optimized (bool, default=False): Use the memory-optimized accumulator if True; by default, the accumulator
+                stores headers in memory as lists of dicts, which can consume a lot of memory for large results.
+                The optimized accumulator stores only unique values and keeps only references to them in the list,
+                which can significantly reduce memory usage.
 
         Returns:
             Tuple[pandas.DataFrame, DataFrameMetadata]: Tuple holding DataFrame and DataFrame metadata.
@@ -398,4 +415,5 @@ def for_exec_result_id(
             use_local_ids_in_headers=use_local_ids_in_headers,
             use_primary_labels_in_attributes=use_primary_labels_in_attributes,
             page_size=page_size,
+            optimized=optimized,
         )
diff --git a/gooddata-pandas/gooddata_pandas/result_convertor.py b/gooddata-pandas/gooddata_pandas/result_convertor.py
index 301e63a30..6a2002318 100644
--- a/gooddata-pandas/gooddata_pandas/result_convertor.py
+++ b/gooddata-pandas/gooddata_pandas/result_convertor.py
@@ -1,4 +1,7 @@
 # (C) 2022 GoodData Corporation
+from abc import ABC, abstractmethod
+from collections.abc import Iterator
+from functools import cached_property
 from typing import Any, Callable, Optional, Union, cast
 
 import pandas
@@ -11,6 +14,163 @@
 LabelOverrides = dict[str, dict[str, dict[str, str]]]
 
 
+@define(frozen=True, slots=True)
+class _Header(ABC):
+    """
+    Abstract base class for headers. There are 4 types of headers:
+    - attribute header with label value and primary label value
+    - attribute header with label name and label identifier
+    - measure header
+    - total header
+
+    We convert the dict representations to _Header objects with slots to improve memory usage.
+    """
+
+    @cached_property
+    @abstractmethod
+    def _dict(self) -> dict[str, Any]:
+        pass
+
+    def get(self, key: str, default: Optional[Any] = None) -> Optional[Any]:
+        return self._dict.get(key, default)
+
+
+@define(frozen=True, slots=True)
+class _AttributeValuePrimary(_Header):
+    """
+    Attribute header with label value and primary label value.
+    """
+
+    label_value: str
+    primary_label_value: str
+
+    @cached_property
+    def _dict(self) -> dict[str, Any]:
+        return {"attributeHeader": {"labelValue": self.label_value, "primaryLabelValue": self.primary_label_value}}
+
+
+@define(frozen=True, slots=True)
+class _AttributeNameLocal(_Header):
+    """
+    Attribute header with label name and label identifier.
+    """
+
+    label_name: str
+    local_identifier: str
+
+    @cached_property
+    def _dict(self) -> dict[str, Any]:
+        return {"attributeHeader": {"labelName": self.label_name, "localIdentifier": self.local_identifier}}
+
+
+@define(frozen=True, slots=True)
+class _MeasureHeader(_Header):
+    """
+    Measure header.
+ """ + + measure_index: str + + @cached_property + def _dict(self) -> dict[str, Any]: + return {"measureHeader": {"measureIndex": self.measure_index}} + + +@define(frozen=True, slots=True) +class _TotalHeader(_Header): + """ + Total header. + """ + + function: str + + @cached_property + def _dict(self) -> dict[str, Any]: + return {"totalHeader": {"function": self.function}} + + +def _header_from_dict(d: dict[str, Any]) -> Optional[_Header]: + """ + Convert dict representation to _Header object. + :param d: dictionary representation of a header + :return: _Header object or None if the dictionary does not represent a header or if it's not supported. + However, we expect that all execution results contain correct data. + """ + if attribute_header := d.get("attributeHeader"): + if "labelValue" in attribute_header: + return _AttributeValuePrimary( + label_value=attribute_header["labelValue"], primary_label_value=attribute_header["primaryLabelValue"] + ) + if "labelName" in attribute_header: + return _AttributeNameLocal( + label_name=attribute_header["labelName"], local_identifier=attribute_header["localIdentifier"] + ) + return None + + if measure_header := d.get("measureHeader"): + return _MeasureHeader(measure_header["measureIndex"]) + + if total_header := d.get("totalHeader"): + return _TotalHeader(total_header["function"]) + + return None + + +@define +class _HeaderContainer: + """ + Container for headers to improve memory usage. + Unique headers are stored as keys in _header_cache and references to them are stored in _headers. + This way we avoid storing the same header multiple times, reducing memory allocations, + which is important for large datatables with many attributes. + """ + + _headers: list[_Header] = field(factory=list) + _header_cache: dict[_Header, _Header] = field(factory=dict) + + def append(self, header_dict: dict) -> None: + """ + Add header to the container. + + First, try to convert header dict to _Header object, and return early if it's not possible. + Then, check if the header is already in the container. + If it is, get a pointer to the existing header and add it to the container. + If it is not, add it to the container. + """ + + header = _header_from_dict(header_dict) + if header is None: + return + + if header not in self._header_cache: + self._header_cache[header] = header + self._headers.append(self._header_cache[header]) + + def extend(self, header_dicts: list[dict]) -> None: + """ + Add multiple headers to the container. + """ + for header_dict in header_dicts: + self.append(header_dict) + + def __iter__(self) -> Iterator[_Header]: + yield from self._headers + + def __len__(self) -> int: + return len(self._headers) + + def __getitem__(self, index: int) -> _Header: + return self._headers[index] + + +_DataHeaderContainers = list[_HeaderContainer] + +# Optimized version of _DataWithHeaders uses _HeaderContainer instead of list of headers +_HeadersByAxis = tuple[ + Union[_DataHeaders, _DataHeaderContainers], Union[Optional[_DataHeaders], Optional[_DataHeaderContainers]] +] + + @frozen class _DataWithHeaders: """Extracted data; either array of values for one-dimensional result or array of arrays of values. @@ -18,7 +178,7 @@ class _DataWithHeaders: Attributes: data (List[_DataArray]): Extracted data; either array of values for one-dimensional result or array of arrays of values. - data_headers (Tuple[_DataHeaders, Optional[_DataHeaders]]): + data_headers (_HeadersByAxis): Per-dimension headers for the data. 
 @frozen
 class _DataWithHeaders:
     """Extracted data; either array of values for one-dimensional result or array of arrays of values.
 
@@ -18,7 +178,7 @@ class _DataWithHeaders:
     Attributes:
         data (List[_DataArray]): Extracted data; either array of values for one-dimensional result or array
             of arrays of values.
-        data_headers (Tuple[_DataHeaders, Optional[_DataHeaders]]):
+        data_headers (_HeadersByAxis):
             Per-dimension headers for the data.
         grand_totals (Tuple[Optional[List[_DataArray]], Optional[List[_DataArray]]]):
             Per-dimension grand total data.
@@ -27,32 +187,34 @@ class _DataWithHeaders:
     """
 
     data: list[_DataArray]
-    data_headers: tuple[_DataHeaders, Optional[_DataHeaders]]
+    data_headers: _HeadersByAxis
     grand_totals: tuple[Optional[list[_DataArray]], Optional[list[_DataArray]]]
     grand_total_headers: tuple[Optional[list[dict[str, _DataHeaders]]], Optional[list[dict[str, _DataHeaders]]]]
 
 
 @define
-class _AccumulatedData:
+class _AbstractAccumulatedData(ABC):
     """
     Utility class to offload code from the function that extracts all data and headers for a particular paged result.
 
     The method drives the paging and calls out to this class to accumulate the essential data and headers from
     the page.
 
+    Note that if optimized is enabled, the data_headers are stored in _HeaderContainer objects instead of lists of
+    dicts. We do not store grand_totals_headers in _HeaderContainer, as we do not expect them to repeat enough
+    to benefit from deduplication.
+
     Attributes:
         data (List[_DataArray]): Holds the accumulated data arrays from the pages.
-        data_headers (List[Optional[_DataHeaders]]): Holds the headers for data arrays.
+        data_headers (List[Optional[Any]]): Holds the headers for data arrays.
         grand_totals (List[Optional[List[_DataArray]]]): Holds the grand total data arrays.
         grand_totals_headers (List[Optional[_DataHeaders]]): Holds the headers for grand total data arrays.
     """
 
     data: list[_DataArray] = field(init=False, factory=list)
-    data_headers: list[Optional[_DataHeaders]] = field(init=False, factory=lambda: [None, None])
+    data_headers: list[Optional[Any]] = field(init=False, factory=lambda: [None, None])
     grand_totals: list[Optional[list[_DataArray]]] = field(init=False, factory=lambda: [None, None])
+    total_of_grant_totals_processed: bool = field(init=False, default=False)
     grand_totals_headers: list[Optional[list[dict[str, _DataHeaders]]]] = field(
         init=False, factory=lambda: [None, None]
     )
-    total_of_grant_totals_processed: bool = field(init=False, default=False)
 
     def accumulate_data(self, from_result: ExecutionResult) -> None:
         """
@@ -79,24 +241,6 @@ def extend_existing_row_data(self, from_result: ExecutionResult) -> None:
         for i in range(len(from_result.data)):
             self.data[offset + i].extend(from_result.data[i])
 
-    def accumulate_headers(self, from_result: ExecutionResult, from_dim: int) -> None:
-        """
-        Accumulate headers for a particular dimension of a result into the provided `data_headers` array at the index
-        matching the dimension index.
-
-        This will mutate the `data_headers`.
-
-        Args:
-            from_result (ExecutionResult): The result whose headers will be accumulated.
-            from_dim (int): The dimension index.
-        """
-
-        if self.data_headers[from_dim] is None:
-            self.data_headers[from_dim] = from_result.get_all_headers(dim=from_dim)
-        else:
-            for idx, headers in enumerate(from_result.get_all_headers(dim=from_dim)):
-                cast(_DataHeaders, self.data_headers[from_dim])[idx].extend(headers)
-
     def accumulate_grand_totals(
         self, from_result: ExecutionResult, paging_dim: int, response: BareExecutionResponse
     ) -> None:
         """
@@ -161,6 +305,56 @@ def accumulate_grand_totals(
             # have row totals and paging down, keep adding extra rows
             grand_totals_item.extend(grand_total["data"])
 
+    @abstractmethod
+    def accumulate_headers(self, from_result: ExecutionResult, from_dim: int) -> None:
+        """
+        Accumulate headers for a particular dimension of a result into the provided `data_headers` array at the index
+        matching the dimension index.
+
+        This will mutate the `data_headers`.
+
+        Args:
+            from_result (ExecutionResult): The result whose headers will be accumulated.
+            from_dim (int): The dimension index.
+        """
+
+    @abstractmethod
+    def result(self) -> _DataWithHeaders:
+        """
+        Returns the data with headers.
+
+        Returns:
+            _DataWithHeaders: The data, data headers, grand totals and grand total headers.
+        """
+
+
+@define
+class _AccumulatedData(_AbstractAccumulatedData):
+    """
+    Implementation of _AbstractAccumulatedData that stores headers as plain lists of dicts;
+    used when non-optimized data extraction is selected.
+
+    This implementation may lead to uncontrolled memory usage for large results.
+    """
+
+    def accumulate_headers(self, from_result: ExecutionResult, from_dim: int) -> None:
+        """
+        Accumulate headers for a particular dimension of a result into the provided `data_headers` array at the index
+        matching the dimension index.
+
+        This will mutate the `data_headers`.
+
+        Args:
+            from_result (ExecutionResult): The result whose headers will be accumulated.
+            from_dim (int): The dimension index.
+        """
+
+        if self.data_headers[from_dim] is None:
+            self.data_headers[from_dim] = from_result.get_all_headers(dim=from_dim)
+        else:
+            for idx, headers in enumerate(from_result.get_all_headers(dim=from_dim)):
+                cast(_DataHeaders, self.data_headers[from_dim])[idx].extend(headers)
 
     def result(self) -> _DataWithHeaders:
         """
         Returns the data with headers.
 
@@ -176,6 +370,55 @@ def result(self) -> _DataWithHeaders:
         )
 
 
+@define
+class _OptimizedAccumulatedData(_AbstractAccumulatedData):
+    """
+    Implementation of _AbstractAccumulatedData that stores headers in _HeaderContainer objects;
+    used when optimized data extraction is selected.
+
+    This implementation is more memory efficient than _AccumulatedData.
+    """
+
+    def accumulate_headers(self, from_result: ExecutionResult, from_dim: int) -> None:
+        """
+        Accumulate headers for a particular dimension of a result into the provided `data_headers` array at the index
+        matching the dimension index.
+
+        This will mutate the `data_headers`.
+
+        Args:
+            from_result (ExecutionResult): The result whose headers will be accumulated.
+            from_dim (int): The dimension index.
+        """
+
+        if containers := self.data_headers[from_dim]:
+            for idx, headers in enumerate(from_result.get_all_headers(dim=from_dim)):
+                containers[idx].extend(headers)
+        else:
+            containers = []
+            for headers in from_result.get_all_headers(dim=from_dim):
+                hc = _HeaderContainer()
+                hc.extend(headers)
+                containers.append(hc)
+            self.data_headers[from_dim] = containers
+
+    def result(self) -> _DataWithHeaders:
+        """
+        Returns the data with headers.
+
+        Returns:
+            _DataWithHeaders: The data, data headers, grand totals and grand total headers.
+        """
+
+        return _DataWithHeaders(
+            data=self.data,
+            data_headers=(cast(_DataHeaderContainers, self.data_headers[0]), self.data_headers[1]),
+            grand_totals=(self.grand_totals[0], self.grand_totals[1]),
+            grand_total_headers=(self.grand_totals_headers[0], self.grand_totals_headers[1]),
+        )
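A rough way to see what the interning buys; an illustrative standalone measurement, not part of the library, and the numbers will vary by Python version and data shape:

```python
import tracemalloc


def traced(build):
    # Measure how much memory is retained by the structure the callable builds.
    tracemalloc.start()
    result = build()
    current, _peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return result, current


values = [f"attr-{i % 100}" for i in range(100_000)]  # 100k header cells, only 100 unique values


def plain():
    # one fresh dict per header cell, as the non-optimized accumulator keeps them
    return [{"attributeHeader": {"labelValue": v, "primaryLabelValue": v}} for v in values]


def interned():
    # one dict per unique value, shared by reference, as the optimized accumulator does
    cache: dict[str, dict] = {}
    out = []
    for v in values:
        if v not in cache:
            cache[v] = {"attributeHeader": {"labelValue": v, "primaryLabelValue": v}}
        out.append(cache[v])
    return out


_, plain_bytes = traced(plain)
_, interned_bytes = traced(interned)
print(f"plain: {plain_bytes:,} B, interned: {interned_bytes:,} B")
```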
 @define
 class DataFrameMetadata:
     """
@@ -206,19 +449,20 @@ class DataFrameMetadata:
     @classmethod
     def from_data(
         cls,
-        headers: tuple[_DataHeaders, Optional[_DataHeaders]],
+        headers: _HeadersByAxis,
         execution_response: BareExecutionResponse,
         primary_labels_from_index: dict[int, dict[str, str]],
         primary_labels_from_columns: dict[int, dict[str, str]],
     ) -> "DataFrameMetadata":
         """This method constructs a DataFrameMetadata object from data headers and an execution response.
 
-        Args: headers (Tuple[_DataHeaders, Optional[_DataHeaders]]):
+        Args: headers (_HeadersByAxis):
             A tuple containing data headers.
             execution_response (BareExecutionResponse): An ExecutionResponse object.
 
         Returns: DataFrameMetadata: An initialized DataFrameMetadata object."""
         row_totals_indexes = [
-            [idx for idx, hdr in enumerate(dim) if hdr is not None and "totalHeader" in hdr] for dim in headers[0]
+            [idx for idx, hdr in enumerate(dim) if hdr is not None and hdr.get("totalHeader") is not None]
+            for dim in headers[0]
         ]
         return cls(
             row_totals_indexes=row_totals_indexes,
@@ -234,6 +478,7 @@ def _read_complete_execution_result(
     result_size_dimensions_limits: ResultSizeDimensions,
     result_size_bytes_limit: Optional[int] = None,
     page_size: int = _DEFAULT_PAGE_SIZE,
+    optimized: bool = False,
 ) -> _DataWithHeaders:
     """
     Extracts all data and headers for an execution result. This does page around the execution result to extract
@@ -245,6 +490,10 @@ def _read_complete_execution_result(
         result_size_dimensions_limits (ResultSizeDimensions): Limits for result size dimensions.
         result_size_bytes_limit (Optional[int], optional): Limit for result size in bytes. Defaults to None.
         page_size (int, optional): Page size to use when reading data. Defaults to _DEFAULT_PAGE_SIZE.
+        optimized (bool, default=False): Use the memory-optimized accumulator if True; by default, the accumulator
+            stores headers in memory as lists of dicts, which can consume a lot of memory for large results.
+            The optimized accumulator stores only unique values and keeps only references to them in the list,
+            which can significantly reduce memory usage.
 
     Returns:
         _DataWithHeaders: All the data and headers from the execution result.
@@ -252,10 +501,10 @@ def _read_complete_execution_result(
     num_dims = len(execution_response.dimensions)
     offset = [0] * num_dims
     limit = [page_size] * num_dims
-    acc = _AccumulatedData()
-    result_size_limits_checked = False
+    acc = _OptimizedAccumulatedData() if optimized else _AccumulatedData()
+    result_size_limits_checked = False
 
     while True:
         # top-level loop pages through the first dimension;
         #
@@ -303,7 +552,6 @@ def _read_complete_execution_result(
             break
 
         offset = [result.next_page_start(dim=0), 0] if num_dims > 1 else [result.next_page_start(dim=0)]
-
     return acc.result()
 
 
@@ -339,14 +587,14 @@ def _create_header_mapper(
     attribute_labels = label_overrides.get("labels", {})
     measure_labels = label_overrides.get("metrics", {})
 
-    def _mapper(header: Any, header_idx: Optional[int]) -> Optional[str]:
+    def _mapper(header: Union[dict, _Header, None], header_idx: Optional[int]) -> Optional[str]:
         label = None
         if header is None:
             pass
-        elif "attributeHeader" in header:
-            if "labelValue" in header["attributeHeader"]:
-                label_value = header["attributeHeader"]["labelValue"]
-                primary_label_value = header["attributeHeader"]["primaryLabelValue"]
+        elif attribute_header := header.get("attributeHeader"):
+            if "labelValue" in attribute_header:
+                label_value = attribute_header["labelValue"]
+                primary_label_value = attribute_header["primaryLabelValue"]
                 label = primary_label_value if use_primary_labels_in_attributes else label_value
                 if header_idx is not None:
                     if header_idx in primary_attribute_labels_mapping:
@@ -359,17 +607,18 @@ def _mapper(header: Any, header_idx: Optional[int]) -> Optional[str]:
                     # Excel formatter apply call failure
                     if label is None:
                         label = " "
-            elif "labelName" in header["attributeHeader"]:
-                attr_local_id = header["attributeHeader"]["localIdentifier"]
+            elif "labelName" in attribute_header:
+                attr_local_id = attribute_header["localIdentifier"]
attribute_header["localIdentifier"] if use_local_ids_in_headers: label = attr_local_id else: if attr_local_id in attribute_labels: label = attribute_labels[attr_local_id]["title"] else: - label = header["attributeHeader"]["labelName"] - elif "measureHeader" in header and header_idx is not None: - measure_idx = header["measureHeader"]["measureIndex"] + label = attribute_header["labelName"] + + elif (measure_header := header.get("measureHeader")) and header_idx is not None: + measure_idx = measure_header["measureIndex"] measure_descriptor = dim_descriptor["headers"][header_idx]["measureGroupHeaders"][measure_idx] if use_local_ids_in_headers: @@ -381,8 +630,9 @@ def _mapper(header: Any, header_idx: Optional[int]) -> Optional[str]: label = measure_descriptor["name"] else: label = measure_descriptor["localIdentifier"] - elif "totalHeader" in header: - label = header["totalHeader"]["function"] + + elif total_header := header.get("totalHeader"): + label = total_header["function"] return label return _mapper @@ -390,7 +640,7 @@ def _mapper(header: Any, header_idx: Optional[int]) -> Optional[str]: def _headers_to_index( dim_idx: int, - headers: tuple[_DataHeaders, Optional[_DataHeaders]], + headers: _HeadersByAxis, response: BareExecutionResponse, label_overrides: LabelOverrides, use_local_ids_in_headers: bool = False, @@ -432,7 +682,7 @@ def _headers_to_index( return pandas.MultiIndex.from_arrays( [ tuple(mapper(header, header_idx) for header in header_group) - for header_idx, header_group in enumerate(cast(_DataHeaders, headers[dim_idx])) + for header_idx, header_group in enumerate(cast(list, headers[dim_idx])) ], names=[mapper(dim_header, None) for dim_header in (response.dimensions[dim_idx]["headers"])], ), primary_attribute_labels_mapping @@ -465,17 +715,17 @@ def _merge_grand_totals_into_data(extract: _DataWithHeaders) -> Union[_DataArray return data -def _merge_grand_total_headers_into_headers(extract: _DataWithHeaders) -> tuple[_DataHeaders, Optional[_DataHeaders]]: +def _merge_grand_total_headers_into_headers(extract: _DataWithHeaders) -> _HeadersByAxis: """Merges grand total headers into data headers. This function will mutate the extracted data. Args: extract (_DataWithHeaders): The data along with its headers that need to be merged. Returns: - Tuple[_DataHeaders, Optional[_DataHeaders]]: + _HeadersByAxis: A tuple containing the modified data headers and the grand total headers if present. """ - headers: tuple[_DataHeaders, Optional[_DataHeaders]] = extract.data_headers + headers: _HeadersByAxis = extract.data_headers for dim_idx, grand_total_headers in enumerate(extract.grand_total_headers): if grand_total_headers is None: @@ -496,6 +746,7 @@ def convert_execution_response_to_dataframe( use_local_ids_in_headers: bool = False, use_primary_labels_in_attributes: bool = False, page_size: int = _DEFAULT_PAGE_SIZE, + optimized: bool = False, ) -> tuple[pandas.DataFrame, DataFrameMetadata]: """ Converts execution result to a pandas dataframe, maintaining the dimensionality of the result. @@ -511,6 +762,10 @@ def convert_execution_response_to_dataframe( use_primary_labels_in_attributes (bool, default=False): Use primary labels in attributes if True, else use default settings. page_size (int, default=_DEFAULT_PAGE_SIZE): Size of the page. + optimized (bool, default=False): Use memory optimized accumulator if True; by default, the accumulator stores + headers in memory as lists of dicts, which can consume a lot of memory for large results. 
+            The optimized accumulator stores only unique values and keeps only references to them in the list,
+            which can significantly reduce memory usage.
 
     Returns:
         Tuple[pandas.DataFrame, DataFrameMetadata]: A tuple containing the created dataframe and its metadata.
@@ -521,7 +776,9 @@ def convert_execution_response_to_dataframe(
         result_size_dimensions_limits=result_size_dimensions_limits,
         result_size_bytes_limit=result_size_bytes_limit,
        page_size=page_size,
+        optimized=optimized,
     )
+
     full_data = _merge_grand_totals_into_data(extract)
     full_headers = _merge_grand_total_headers_into_headers(extract)
diff --git a/gooddata-pandas/tests/dataframe/test_dataframe_for_exec_def.py b/gooddata-pandas/tests/dataframe/test_dataframe_for_exec_def.py
index 2f1fcc812..4973292ac 100644
--- a/gooddata-pandas/tests/dataframe/test_dataframe_for_exec_def.py
+++ b/gooddata-pandas/tests/dataframe/test_dataframe_for_exec_def.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Optional
 
+import pytest
 from gooddata_pandas import DataFrameFactory
 from gooddata_sdk import (
     Attribute,
@@ -28,6 +29,7 @@ def _run_and_validate_results(
     expected: tuple[int, int],
     expected_row_totals: Optional[list[list[int]]] = None,
     page_size: int = 100,
+    optimized: bool = False,
 ) -> str:
     # generate dataframe from exec_def
     result, result_metadata = gdf.for_exec_def(exec_def=exec_def, page_size=page_size)
@@ -35,7 +37,7 @@ def _run_and_validate_results(
 
     # use result ID from computation above and generate dataframe just from it
     result_from_result_id, result_metadata_from_result_id = gdf.for_exec_result_id(
-        result_id=result_metadata.execution_response.result_id, page_size=page_size
+        result_id=result_metadata.execution_response.result_id, page_size=page_size, optimized=optimized
     )
 
     if expected_row_totals is not None:
@@ -50,7 +52,8 @@ def _run_and_validate_results(
 
 
 @gd_vcr.use_cassette(str(_fixtures_dir / "dataframe_for_exec_def_two_dim1.yaml"))
-def test_dataframe_for_exec_def_two_dim1(test_config, gdf: DataFrameFactory):
+@pytest.mark.parametrize("optimized", [True, False])
+def test_dataframe_for_exec_def_two_dim1(test_config, gdf: DataFrameFactory, optimized: bool):
     exec_def = ExecutionDefinition(
         attributes=[
             Attribute(local_id="region", label="region"),
@@ -67,7 +70,7 @@ def test_dataframe_for_exec_def_two_dim1(test_config, gdf: DataFrameFactory):
             TableDimension(item_ids=["product_category", "measureGroup"]),
         ],
     )
-    exec_result_id = _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(48, 8))
+    exec_result_id = _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(48, 8), optimized=optimized)
 
     # check also label overrides
     overrides = {
@@ -78,7 +81,7 @@ def test_dataframe_for_exec_def_two_dim1(test_config, gdf: DataFrameFactory):
             "price": {"title": "PRICE LABEL"},
         },
     }
-    result, _ = gdf.for_exec_result_id(exec_result_id, label_overrides=overrides)
+    result, _ = gdf.for_exec_result_id(exec_result_id, label_overrides=overrides, optimized=optimized)
 
     assert result.to_string().find(overrides["labels"]["state"]["title"]) == 262
     assert result.to_string().find(overrides["metrics"]["price"]["title"]) == 162
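Note the division of labor in `_run_and_validate_results`: the first frame is built through `for_exec_def` without the flag, while the second is rebuilt from the result id with the parametrized flag, so validating both frames effectively exercises the optimized accumulator against the non-optimized baseline. A hypothetical direct check of the same property (not part of this change) could look like:

```python
import pandas.testing


def assert_accumulators_agree(gdf, exec_def):
    # Hypothetical helper: build the same result with both accumulators and compare.
    plain_df, _ = gdf.for_exec_def(exec_def=exec_def, optimized=False)
    optimized_df, _ = gdf.for_exec_def(exec_def=exec_def, optimized=True)
    pandas.testing.assert_frame_equal(plain_df, optimized_df)
```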
attributes=[ Attribute(local_id="region", label="region"), @@ -162,11 +166,12 @@ def test_dataframe_for_exec_def_two_dim2(gdf: DataFrameFactory): TableDimension(item_ids=["measureGroup"]), ], ) - _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(182, 2)) + _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(182, 2), optimized=optimized) @gd_vcr.use_cassette(str(_fixtures_dir / "dataframe_for_exec_def_two_dim3.yaml")) -def test_dataframe_for_exec_def_two_dim3(gdf: DataFrameFactory): +@pytest.mark.parametrize("optimized", [True, False]) +def test_dataframe_for_exec_def_two_dim3(gdf: DataFrameFactory, optimized: bool): exec_def = ExecutionDefinition( attributes=[ Attribute(local_id="region", label="region"), @@ -183,11 +188,12 @@ def test_dataframe_for_exec_def_two_dim3(gdf: DataFrameFactory): TableDimension(item_ids=["region", "state", "measureGroup"]), ], ) - _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(4, 96)) + _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(4, 96), optimized=optimized) @gd_vcr.use_cassette(str(_fixtures_dir / "dataframe_for_exec_def_totals1.yaml")) -def test_dataframe_for_exec_def_totals1(gdf: DataFrameFactory): +@pytest.mark.parametrize("optimized", [True, False]) +def test_dataframe_for_exec_def_totals1(gdf: DataFrameFactory, optimized: bool): """ Execution with column totals; the row dimension has single label """ @@ -221,11 +227,14 @@ def test_dataframe_for_exec_def_totals1(gdf: DataFrameFactory): ), ], ) - _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(6, 96), expected_row_totals=[[4, 5]]) + _run_and_validate_results( + gdf=gdf, exec_def=exec_def, expected=(6, 96), expected_row_totals=[[4, 5]], optimized=optimized + ) @gd_vcr.use_cassette(str(_fixtures_dir / "dataframe_for_exec_def_totals2.yaml")) -def test_dataframe_for_exec_def_totals2(gdf: DataFrameFactory): +@pytest.mark.parametrize("optimized", [True, False]) +def test_dataframe_for_exec_def_totals2(gdf: DataFrameFactory, optimized: bool): """ Execution with column totals; the row dimension have two labels; this exercises that the index is padded appropriately @@ -260,11 +269,14 @@ def test_dataframe_for_exec_def_totals2(gdf: DataFrameFactory): ), ], ) - _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(19, 96), expected_row_totals=[[17, 18], [17, 18]]) + _run_and_validate_results( + gdf=gdf, exec_def=exec_def, expected=(19, 96), expected_row_totals=[[17, 18], [17, 18]], optimized=optimized + ) @gd_vcr.use_cassette(str(_fixtures_dir / "dataframe_for_exec_def_totals3.yaml")) -def test_dataframe_for_exec_def_totals3(gdf: DataFrameFactory): +@pytest.mark.parametrize("optimized", [True, False]) +def test_dataframe_for_exec_def_totals3(gdf: DataFrameFactory, optimized: bool): """ Execution with row totals; the column dimension has single label. """ @@ -298,11 +310,12 @@ def test_dataframe_for_exec_def_totals3(gdf: DataFrameFactory): ), ], ) - _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(96, 6)) + _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(96, 6), optimized=optimized) @gd_vcr.use_cassette(str(_fixtures_dir / "dataframe_for_exec_def_totals4.yaml")) -def test_dataframe_for_exec_def_totals4(gdf: DataFrameFactory): +@pytest.mark.parametrize("optimized", [True, False]) +def test_dataframe_for_exec_def_totals4(gdf: DataFrameFactory, optimized: bool): """ Execution with row totals; the column dimension have two label. 
""" @@ -336,7 +349,7 @@ def test_dataframe_for_exec_def_totals4(gdf: DataFrameFactory): ), ], ) - _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(96, 19)) + _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(96, 19), optimized=optimized) # TODO - not implemented yet @@ -377,7 +390,8 @@ def test_dataframe_for_exec_def_totals4(gdf: DataFrameFactory): @gd_vcr.use_cassette(str(_fixtures_dir / "dataframe_for_exec_def_one_dim1.yaml")) -def test_dataframe_for_exec_def_one_dim1(gdf: DataFrameFactory): +@pytest.mark.parametrize("optimized", [True, False]) +def test_dataframe_for_exec_def_one_dim1(gdf: DataFrameFactory, optimized: bool): exec_def = ExecutionDefinition( attributes=[ Attribute(local_id="region", label="region"), @@ -392,11 +406,12 @@ def test_dataframe_for_exec_def_one_dim1(gdf: DataFrameFactory): dimensions=[TableDimension(item_ids=["region", "state", "product_category", "measureGroup"])], ) # TODO: remove page_size=500 once UNI-591 is resolved - _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(364, 1), page_size=500) + _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(364, 1), page_size=500, optimized=optimized) @gd_vcr.use_cassette(str(_fixtures_dir / "dataframe_for_exec_def_one_dim2.yaml")) -def test_dataframe_for_exec_def_one_dim2(gdf: DataFrameFactory): +@pytest.mark.parametrize("optimized", [True, False]) +def test_dataframe_for_exec_def_one_dim2(gdf: DataFrameFactory, optimized: bool): exec_def = ExecutionDefinition( attributes=[ Attribute(local_id="region", label="region"), @@ -413,4 +428,4 @@ def test_dataframe_for_exec_def_one_dim2(gdf: DataFrameFactory): TableDimension(item_ids=["region", "state", "product_category", "measureGroup"]), ], ) - _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(1, 364)) + _run_and_validate_results(gdf=gdf, exec_def=exec_def, expected=(1, 364), optimized=optimized)