Skip to content

Commit fcceef4

Browse files
committed
feat: optimize memory allocation when converting execution response to dataframe
Add `optimized` flag to DataFrameFactory to enable memory-optimized conversion of an execution response to a pandas dataframe. Without the flag, the conversion runs as usual, storing headers as a list of dictionaries. The optimized version stores only unique headers and references them, preventing unnecessary memory allocations when many duplicated headers are processed. Note that the new behaviour is optional and turned off by default, so no existing usages should be affected. JIRA: CQ-1579 risk: low
1 parent 82a1c2d commit fcceef4

File tree

4 files changed

+361
-68
lines changed

4 files changed

+361
-68
lines changed

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ One logical change is done in one commit.
6666

6767
### Documenting new features
6868

69-
To document a new feature, you need to create a new `.md` file in one of the subsections. These subsections reresent the left navigation menu and are in a hierarchical directories.
69+
To document a new feature, you need to create a new `.md` file in one of the subsections. These subsections represent the left navigation menu and are organized in hierarchical directories.
7070

7171
e.g.:
7272

gooddata-pandas/gooddata_pandas/dataframe.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ def for_created_visualization(
238238
created_visualizations_response: dict,
239239
on_execution_submitted: Optional[Callable[[Execution], None]] = None,
240240
is_cancellable: bool = False,
241+
optimized: bool = False,
241242
) -> tuple[pandas.DataFrame, DataFrameMetadata]:
242243
"""
243244
Creates a data frame using a created visualization.
@@ -247,6 +248,10 @@ def for_created_visualization(
247248
on_execution_submitted (Optional[Callable[[Execution], None]]): Callback to call when the execution was
248249
submitted to the backend.
249250
is_cancellable (bool, optional): Whether the execution should be cancelled when the connection is interrupted.
251+
optimized (bool, default=False): Use memory optimized accumulator if True; by default, the accumulator stores
252+
headers in memory as lists of dicts, which can consume a lot of memory for large results.
253+
The optimized accumulator stores only unique values and keeps only references to them in the list,
254+
which can significantly reduce memory usage.
250255
251256
Returns:
252257
pandas.DataFrame: A DataFrame instance.
@@ -257,6 +262,7 @@ def for_created_visualization(
257262
return self.for_exec_def(
258263
exec_def=execution_definition,
259264
on_execution_submitted=on_execution_submitted,
265+
optimized=optimized,
260266
)
261267

262268
def result_cache_metadata_for_exec_result_id(self, result_id: str) -> ResultCacheMetadata:
@@ -279,6 +285,7 @@ def for_exec_def(
279285
result_size_bytes_limit: Optional[int] = None,
280286
page_size: int = _DEFAULT_PAGE_SIZE,
281287
on_execution_submitted: Optional[Callable[[Execution], None]] = None,
288+
optimized: bool = False,
282289
) -> tuple[pandas.DataFrame, DataFrameMetadata]:
283290
"""
284291
Creates a data frame using an execution definition.
@@ -311,6 +318,10 @@ def for_exec_def(
311318
page_size (int): Number of records per page.
312319
on_execution_submitted (Optional[Callable[[Execution], None]]): Callback to call when the execution was
313320
submitted to the backend.
321+
optimized (bool, default=False): Use memory optimized accumulator if True; by default, the accumulator stores
322+
headers in memory as lists of dicts, which can consume a lot of memory for large results.
323+
The optimized accumulator stores only unique values and keeps only references to them in the list,
324+
which can significantly reduce memory usage.
314325
315326
Returns:
316327
Tuple[pandas.DataFrame, DataFrameMetadata]: Tuple holding DataFrame and DataFrame metadata.
@@ -331,6 +342,7 @@ def for_exec_def(
331342
result_size_dimensions_limits=result_size_dimensions_limits,
332343
result_size_bytes_limit=result_size_bytes_limit,
333344
page_size=page_size,
345+
optimized=optimized,
334346
)
335347

336348
def for_exec_result_id(
@@ -343,6 +355,7 @@ def for_exec_result_id(
343355
use_local_ids_in_headers: bool = False,
344356
use_primary_labels_in_attributes: bool = False,
345357
page_size: int = _DEFAULT_PAGE_SIZE,
358+
optimized: bool = False,
346359
) -> tuple[pandas.DataFrame, DataFrameMetadata]:
347360
"""
348361
Retrieves a DataFrame and DataFrame metadata for a given execution result identifier.
@@ -373,6 +386,10 @@ def for_exec_result_id(
373386
use_local_ids_in_headers (bool): Use local identifier in headers.
374387
use_primary_labels_in_attributes (bool): Use primary labels in attributes.
375388
page_size (int): Number of records per page.
389+
optimized (bool, default=False): Use memory optimized accumulator if True; by default, the accumulator stores
390+
headers in memory as lists of dicts, which can consume a lot of memory for large results.
391+
The optimized accumulator stores only unique values and keeps only references to them in the list,
392+
which can significantly reduce memory usage.
376393
377394
Returns:
378395
Tuple[pandas.DataFrame, DataFrameMetadata]: Tuple holding DataFrame and DataFrame metadata.
@@ -398,4 +415,5 @@ def for_exec_result_id(
398415
use_local_ids_in_headers=use_local_ids_in_headers,
399416
use_primary_labels_in_attributes=use_primary_labels_in_attributes,
400417
page_size=page_size,
418+
optimized=optimized,
401419
)

0 commit comments

Comments
 (0)