From 6e633a2a9dc9bd577071b499a23514cb18a5c391 Mon Sep 17 00:00:00 2001 From: Paul Spangler <7519484+spanglerco@users.noreply.github.com> Date: Mon, 4 Aug 2025 13:02:38 -0500 Subject: [PATCH 01/28] WIP --- nisystemlink/clients/core/_uplink/_methods.py | 8 ++- .../clients/dataframe/_data_frame_client.py | 62 ++++++++++++++++++- poetry.lock | 58 ++++++++++++++++- pyproject.toml | 1 + 4 files changed, 125 insertions(+), 4 deletions(-) diff --git a/nisystemlink/clients/core/_uplink/_methods.py b/nisystemlink/clients/core/_uplink/_methods.py index 6cd187f0..bc78d15d 100644 --- a/nisystemlink/clients/core/_uplink/_methods.py +++ b/nisystemlink/clients/core/_uplink/_methods.py @@ -5,6 +5,7 @@ from uplink import ( Body, commands, + headers, json, response_handler as uplink_response_handler, returns, @@ -26,13 +27,18 @@ def post( path: str, args: Optional[Sequence[Any]] = None, return_key: Optional[Union[str, Tuple[str, ...]]] = None, + content_type: Optional[str] = None, ) -> Callable[[F], F]: """Annotation for a POST request with a JSON request body. If args is not specified, defaults to a single argument that represents the request body. """ def decorator(func: F) -> F: - result = json(commands.post(path, args=args or (Body,))(func)) + result = commands.post(path, args=args or (Body,))(func) + if content_type: + result = headers({"Content-Type": content_type})(result) + else: + result = json(result) # type: ignore if return_key: result = returns.json(key=return_key)(result) return result # type: ignore diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 8b976985..22469883 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -1,5 +1,8 @@ """Implementation of DataFrameClient.""" +import pyarrow as pa +from collections.abc import Iterable +from io import BytesIO from typing import List, Optional from nisystemlink.clients import core @@ -13,7 +16,7 @@ ) from nisystemlink.clients.core.helpers import IteratorFileLike from requests.models import Response -from uplink import Body, Field, Path, Query, retry +from uplink import Body, Field, Path, Query, headers, retry from . import models @@ -263,6 +266,61 @@ def append_table_data(self, id: str, data: models.AppendTableDataRequest) -> Non """ ... + def append_table_data2( + self, + id: str, + data: Iterable[pa.RecordBatch], + end_of_data: bool = False, + ) -> None: + """Appends one or more rows of data to the table identified by its ID. + + Args: + id: Unique ID of a data table. + data: The data to append. + end_of_data: Whether the table should expect any additional rows to be + appended in future requests. + + Raises: + ApiException: if unable to communicate with the DataFrame Service + or provided an invalid argument. 
+ """ + + def _generate_body() -> Iterable[memoryview]: + data_iter = iter(data) + try: + batch = next(data_iter) + except StopIteration: + return + with BytesIO() as buf: + options = pa.ipc.IpcWriteOptions(compression="zstd") + writer = pa.ipc.new_stream(buf, batch.schema, options=options) + + while True: + writer.write_batch(batch) + with buf.getbuffer() as view, view[0 : buf.tell()] as slice: + yield slice + buf.seek(0) + try: + batch = next(data_iter) + except StopIteration: + break + + writer.close() + with buf.getbuffer() as view: + with view[0 : buf.tell()] as slice: + yield slice + + return self.__append_table_data_arrow(id, _generate_body(), end_of_data) + + @post( + "tables/{id}/data", + args=[Path, Body, Query], + content_type="application/vnd.apache.arrow.stream", + ) + def __append_table_data_arrow( + self, id: str, data: Iterable[bytes], end_of_data: bool + ) -> None: ... + @post("tables/{id}/query-data", args=[Path, Body]) def query_table_data( self, id: str, query: models.QueryTableDataRequest @@ -301,7 +359,7 @@ def query_decimated_data( """ ... - def _iter_content_filelike_wrapper(response: Response) -> IteratorFileLike: + def _iter_content_filelike_wrapper(response: Response) -> IteratorFileLike: # type: ignore return IteratorFileLike(response.iter_content(chunk_size=4096)) @response_handler(_iter_content_filelike_wrapper) diff --git a/poetry.lock b/poetry.lock index a4aeb10b..9abcc72f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -774,6 +774,62 @@ tomli = ">=1.2.2" [package.extras] poetry-plugin = ["poetry (>=1.0,<2.0)"] +[[package]] +name = "pyarrow" +version = "21.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e563271e2c5ff4d4a4cbeb2c83d5cf0d4938b891518e676025f7268c6fe5fe26"}, + {file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fee33b0ca46f4c85443d6c450357101e47d53e6c3f008d658c27a2d020d44c79"}, + {file = "pyarrow-21.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:7be45519b830f7c24b21d630a31d48bcebfd5d4d7f9d3bdb49da9cdf6d764edb"}, + {file = "pyarrow-21.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:26bfd95f6bff443ceae63c65dc7e048670b7e98bc892210acba7e4995d3d4b51"}, + {file = "pyarrow-21.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bd04ec08f7f8bd113c55868bd3fc442a9db67c27af098c5f814a3091e71cc61a"}, + {file = "pyarrow-21.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9b0b14b49ac10654332a805aedfc0147fb3469cbf8ea951b3d040dab12372594"}, + {file = "pyarrow-21.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9d9f8bcb4c3be7738add259738abdeddc363de1b80e3310e04067aa1ca596634"}, + {file = "pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c077f48aab61738c237802836fc3844f85409a46015635198761b0d6a688f87b"}, + {file = "pyarrow-21.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:689f448066781856237eca8d1975b98cace19b8dd2ab6145bf49475478bcaa10"}, + {file = "pyarrow-21.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:479ee41399fcddc46159a551705b89c05f11e8b8cb8e968f7fec64f62d91985e"}, + {file = "pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:40ebfcb54a4f11bcde86bc586cbd0272bac0d516cfa539c799c2453768477569"}, + {file = "pyarrow-21.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8d58d8497814274d3d20214fbb24abcad2f7e351474357d552a8d53bce70c70e"}, + {file = 
"pyarrow-21.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:585e7224f21124dd57836b1530ac8f2df2afc43c861d7bf3d58a4870c42ae36c"}, + {file = "pyarrow-21.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:555ca6935b2cbca2c0e932bedd853e9bc523098c39636de9ad4693b5b1df86d6"}, + {file = "pyarrow-21.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3a302f0e0963db37e0a24a70c56cf91a4faa0bca51c23812279ca2e23481fccd"}, + {file = "pyarrow-21.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:b6b27cf01e243871390474a211a7922bfbe3bda21e39bc9160daf0da3fe48876"}, + {file = "pyarrow-21.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e72a8ec6b868e258a2cd2672d91f2860ad532d590ce94cdf7d5e7ec674ccf03d"}, + {file = "pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b7ae0bbdc8c6674259b25bef5d2a1d6af5d39d7200c819cf99e07f7dfef1c51e"}, + {file = "pyarrow-21.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:58c30a1729f82d201627c173d91bd431db88ea74dcaa3885855bc6203e433b82"}, + {file = "pyarrow-21.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:072116f65604b822a7f22945a7a6e581cfa28e3454fdcc6939d4ff6090126623"}, + {file = "pyarrow-21.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:cf56ec8b0a5c8c9d7021d6fd754e688104f9ebebf1bf4449613c9531f5346a18"}, + {file = "pyarrow-21.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e99310a4ebd4479bcd1964dff9e14af33746300cb014aa4a3781738ac63baf4a"}, + {file = "pyarrow-21.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d2fe8e7f3ce329a71b7ddd7498b3cfac0eeb200c2789bd840234f0dc271a8efe"}, + {file = "pyarrow-21.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f522e5709379d72fb3da7785aa489ff0bb87448a9dc5a75f45763a795a089ebd"}, + {file = "pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:69cbbdf0631396e9925e048cfa5bce4e8c3d3b41562bbd70c685a8eb53a91e61"}, + {file = "pyarrow-21.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:731c7022587006b755d0bdb27626a1a3bb004bb56b11fb30d98b6c1b4718579d"}, + {file = "pyarrow-21.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dc56bc708f2d8ac71bd1dcb927e458c93cec10b98eb4120206a4091db7b67b99"}, + {file = "pyarrow-21.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:186aa00bca62139f75b7de8420f745f2af12941595bbbfa7ed3870ff63e25636"}, + {file = "pyarrow-21.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:a7a102574faa3f421141a64c10216e078df467ab9576684d5cd696952546e2da"}, + {file = "pyarrow-21.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:1e005378c4a2c6db3ada3ad4c217b381f6c886f0a80d6a316fe586b90f77efd7"}, + {file = "pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:65f8e85f79031449ec8706b74504a316805217b35b6099155dd7e227eef0d4b6"}, + {file = "pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:3a81486adc665c7eb1a2bde0224cfca6ceaba344a82a971ef059678417880eb8"}, + {file = "pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503"}, + {file = "pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79"}, + {file = "pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10"}, + {file = "pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a7f6524e3747e35f80744537c78e7302cd41deee8baa668d56d55f77d9c464b3"}, + {file = 
"pyarrow-21.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:203003786c9fd253ebcafa44b03c06983c9c8d06c3145e37f1b76a1f317aeae1"}, + {file = "pyarrow-21.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b4d97e297741796fead24867a8dabf86c87e4584ccc03167e4a811f50fdf74d"}, + {file = "pyarrow-21.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:898afce396b80fdda05e3086b4256f8677c671f7b1d27a6976fa011d3fd0a86e"}, + {file = "pyarrow-21.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:067c66ca29aaedae08218569a114e413b26e742171f526e828e1064fcdec13f4"}, + {file = "pyarrow-21.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0c4e75d13eb76295a49e0ea056eb18dbd87d81450bfeb8afa19a7e5a75ae2ad7"}, + {file = "pyarrow-21.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdc4c17afda4dab2a9c0b79148a43a7f4e1094916b3e18d8975bfd6d6d52241f"}, + {file = "pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc"}, +] + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pycodestyle" version = "2.9.1" @@ -1398,4 +1454,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.1" python-versions = "^3.9" -content-hash = "809134922cb291c65292513ace5efb024a705e81bd0be88c69cdc66efa120071" +content-hash = "92f274ea55d4fc180cd962e9c4da63df36472b7f967ab06ca38527468dbd7ea7" diff --git a/pyproject.toml b/pyproject.toml index b99911e2..edc2b189 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ uplink = [ pydantic = "^2.11.3" pyyaml = "^6.0.1" pandas = "^2.1.0" +pyarrow = "^21.0.0" [tool.poetry.group.dev.dependencies] black = ">=22.10,<25.0" From 04959667caf65a1c4db72af966c475a2fa86d002 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Mon, 8 Sep 2025 15:20:27 -0500 Subject: [PATCH 02/28] iterating --- .../clients/dataframe/_data_frame_client.py | 165 ++++++++++++------ 1 file changed, 114 insertions(+), 51 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 22469883..d58bd68b 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -1,9 +1,12 @@ """Implementation of DataFrameClient.""" -import pyarrow as pa +try: # Optional pyarrow dependency + import pyarrow as pa # type: ignore +except Exception: # pragma: no cover - pyarrow not installed + pa = None # type: ignore from collections.abc import Iterable from io import BytesIO -from typing import List, Optional +from typing import Any, List, Optional, Union from nisystemlink.clients import core from nisystemlink.clients.core._uplink._base_client import BaseClient @@ -252,74 +255,134 @@ def get_table_data( """ ... - @post("tables/{id}/data", args=[Path, Body]) - def append_table_data(self, id: str, data: models.AppendTableDataRequest) -> None: - """Appends one or more rows of data to the table identified by its ID. - - Args: - id: Unique ID of a data table. - data: The rows of data to append and any additional options. + @post( + "tables/{id}/data", + args=[Path, Body] + ) + def _append_table_data_json( + self, id: str, data: models.AppendTableDataRequest + ) -> None: + ... - Raises: - ApiException: if unable to communicate with the DataFrame Service - or provided an invalid argument. 
- """ + @post( + "tables/{id}/data", + args=[Path, Body, Query], + content_type="application/vnd.apache.arrow.stream", + ) + def _append_table_data_arrow( + self, id: str, data: Iterable[bytes], end_of_data: bool + ) -> None: ... - def append_table_data2( + def append_table_data( self, id: str, - data: Iterable[pa.RecordBatch], - end_of_data: bool = False, + data: Optional[ + Union[ + models.AppendTableDataRequest, + models.DataFrame, + Iterable["pa.RecordBatch"], # type: ignore[name-defined] + ] + ], + *, + end_of_data: Optional[bool] = None, ) -> None: """Appends one or more rows of data to the table identified by its ID. + This method accepts: + - None: ``end_of_data`` must be provided. Sends JSON with only the ``endOfData`` flag. + - DataFrame (service model): Wrapped into an AppendTableDataRequest (with optional ``end_of_data``) and sent as JSON. + - AppendTableDataRequest: Sent as-is via JSON. ``end_of_data`` must be ``None``. + - Iterable[pyarrow.RecordBatch]: Streamed as Arrow IPC. ``end_of_data`` (if provided) is sent as a query parameter. If the iterable yields no batches, behaves like the ``None`` case (thus requiring ``end_of_data``). + Args: id: Unique ID of a data table. - data: The data to append. + data: The data to append in one of the supported forms. end_of_data: Whether the table should expect any additional rows to be - appended in future requests. + appended in future requests. Required when ``data`` is ``None`` or + an empty iterator of RecordBatches. Must be omitted when ``data`` is + an ``AppendTableDataRequest`` (provide it inside the request model). Raises: - ApiException: if unable to communicate with the DataFrame Service - or provided an invalid argument. + ValueError: If parameter constraints are violated. + ApiException: If unable to communicate with the DataFrame Service + or an invalid argument is provided. """ - def _generate_body() -> Iterable[memoryview]: - data_iter = iter(data) + if isinstance(data, models.AppendTableDataRequest): + if end_of_data is not None: + raise ValueError( + "end_of_data must not be provided separately when passing an AppendTableDataRequest." + ) + self._append_table_data_json(id, data) + return + + if isinstance(data, models.DataFrame): + self._append_table_data_json( + id, models.AppendTableDataRequest(frame=data, end_of_data=end_of_data) + ) + return + + if isinstance(data, Iterable): + iterator = iter(data) try: - batch = next(data_iter) + first_batch = next(iterator) except StopIteration: + if end_of_data is None: + raise ValueError( + "end_of_data must be provided when data iterator is empty." + ) + self._append_table_data_json( + id, models.AppendTableDataRequest(end_of_data=end_of_data) + ) return - with BytesIO() as buf: - options = pa.ipc.IpcWriteOptions(compression="zstd") - writer = pa.ipc.new_stream(buf, batch.schema, options=options) - - while True: - writer.write_batch(batch) - with buf.getbuffer() as view, view[0 : buf.tell()] as slice: - yield slice - buf.seek(0) - try: - batch = next(data_iter) - except StopIteration: - break - - writer.close() - with buf.getbuffer() as view: - with view[0 : buf.tell()] as slice: - yield slice - - return self.__append_table_data_arrow(id, _generate_body(), end_of_data) - @post( - "tables/{id}/data", - args=[Path, Body, Query], - content_type="application/vnd.apache.arrow.stream", - ) - def __append_table_data_arrow( - self, id: str, data: Iterable[bytes], end_of_data: bool - ) -> None: ... + if pa is None: + raise RuntimeError( + "pyarrow is not installed. 
Install with the appropriate extra to stream RecordBatches." + ) + + if not isinstance(first_batch, pa.RecordBatch): + raise ValueError( + "Iterable provided to data must yield pyarrow.RecordBatch objects." + ) + + def _generate_body() -> Iterable[memoryview]: + data_iter = (b for b in (first_batch, *iterator)) + first = True + with BytesIO() as buf: + writer = None + for batch in data_iter: + if first: + options = pa.ipc.IpcWriteOptions(compression="zstd") + writer = pa.ipc.new_stream(buf, batch.schema, options=options) + first = False + writer.write_batch(batch) + with buf.getbuffer() as view, view[0 : buf.tell()] as slice: + yield slice + buf.seek(0) + if writer is not None: + writer.close() + with buf.getbuffer() as view: + with view[0 : buf.tell()] as slice: + yield slice + + self._append_table_data_arrow(id, _generate_body(), end_of_data or False) + return + + if data is None: + if end_of_data is None: + raise ValueError( + "end_of_data must be provided when data is None (no rows to append)." + ) + self._append_table_data_json( + id, models.AppendTableDataRequest(end_of_data=end_of_data) + ) + return + + raise ValueError( + "Unsupported type for data. Expected AppendTableDataRequest, DataFrame, Iterable[RecordBatch], or None." + ) @post("tables/{id}/query-data", args=[Path, Body]) def query_table_data( From 7dcc639af3565937db8ab56a5b84f30d9e6e68f9 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Mon, 8 Sep 2025 16:01:15 -0500 Subject: [PATCH 03/28] iterating --- nisystemlink/clients/dataframe/_data_frame_client.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index d58bd68b..b026c1b0 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -289,11 +289,11 @@ def append_table_data( ) -> None: """Appends one or more rows of data to the table identified by its ID. - This method accepts: - - None: ``end_of_data`` must be provided. Sends JSON with only the ``endOfData`` flag. - - DataFrame (service model): Wrapped into an AppendTableDataRequest (with optional ``end_of_data``) and sent as JSON. - - AppendTableDataRequest: Sent as-is via JSON. ``end_of_data`` must be ``None``. - - Iterable[pyarrow.RecordBatch]: Streamed as Arrow IPC. ``end_of_data`` (if provided) is sent as a query parameter. If the iterable yields no batches, behaves like the ``None`` case (thus requiring ``end_of_data``). + This method accepts: + - AppendTableDataRequest: Sent as-is via JSON. ``end_of_data`` must be ``None``. + - DataFrame (service model): Wrapped into an AppendTableDataRequest (with optional ``end_of_data``) and sent as JSON. + - Iterable[pyarrow.RecordBatch]: Streamed as Arrow IPC. ``end_of_data`` (if provided) is sent as a query parameter. If the iterable yields no batches, behaves like the ``None`` case (thus requiring ``end_of_data``). + - None: ``end_of_data`` must be provided. Sends JSON with only the ``endOfData`` flag. Args: id: Unique ID of a data table. @@ -339,7 +339,7 @@ def append_table_data( if pa is None: raise RuntimeError( - "pyarrow is not installed. Install with the appropriate extra to stream RecordBatches." + "pyarrow is not installed. Install to stream RecordBatches." 
) if not isinstance(first_batch, pa.RecordBatch): From 29b929976372443352ab40b4a0e5265ebb556614 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Tue, 9 Sep 2025 08:36:15 -0500 Subject: [PATCH 04/28] Update _data_frame_client.py error handling and comments --- .../clients/dataframe/_data_frame_client.py | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index b026c1b0..f1642a8b 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -289,24 +289,18 @@ def append_table_data( ) -> None: """Appends one or more rows of data to the table identified by its ID. - This method accepts: - - AppendTableDataRequest: Sent as-is via JSON. ``end_of_data`` must be ``None``. - - DataFrame (service model): Wrapped into an AppendTableDataRequest (with optional ``end_of_data``) and sent as JSON. - - Iterable[pyarrow.RecordBatch]: Streamed as Arrow IPC. ``end_of_data`` (if provided) is sent as a query parameter. If the iterable yields no batches, behaves like the ``None`` case (thus requiring ``end_of_data``). - - None: ``end_of_data`` must be provided. Sends JSON with only the ``endOfData`` flag. - Args: id: Unique ID of a data table. - data: The data to append in one of the supported forms. - end_of_data: Whether the table should expect any additional rows to be - appended in future requests. Required when ``data`` is ``None`` or - an empty iterator of RecordBatches. Must be omitted when ``data`` is - an ``AppendTableDataRequest`` (provide it inside the request model). + data: The data to append in one of the following forms: + - AppendTableDataRequest: Sent as-is via JSON. ``end_of_data`` must be ``None``. + - DataFrame (service model): Wrapped into an AppendTableDataRequest (``end_of_data`` optional) and sent as JSON. + - Iterable[pyarrow.RecordBatch]: Streamed as Arrow IPC. ``end_of_data`` (if provided) is sent as a query parameter. If the iterator yields no batches, it is treated like ``None`` and requires ``end_of_data``. + - None: ``end_of_data`` must be provided; sends JSON containing only the ``endOfData`` flag. + end_of_data: Whether the table should expect any additional rows to be appended in future requests. Required when ``data`` is ``None`` or an empty RecordBatch iterator; must be omitted when supplying an ``AppendTableDataRequest`` (put it inside that model instead). Raises: ValueError: If parameter constraints are violated. - ApiException: If unable to communicate with the DataFrame Service - or an invalid argument is provided. + ApiException: If unable to communicate with the DataFrame Service or an invalid argument is provided. """ if isinstance(data, models.AppendTableDataRequest): @@ -367,7 +361,21 @@ def _generate_body() -> Iterable[memoryview]: with view[0 : buf.tell()] as slice: yield slice - self._append_table_data_arrow(id, _generate_body(), end_of_data or False) + try: + self._append_table_data_arrow(id, _generate_body(), end_of_data or False) + except core.ApiException as ex: + # TODO: Maybe we should also check the error code in addition to the response status code? + if ex.http_status_code == 400: + raise core.ApiException( + ( + "Arrow ingestion request was rejected. The target DataFrame Service doesn't support Arrow streaming. " + "Install a DataFrame Service version with Arrow support or fall back to JSON ingestion." 
+ ), + error=ex.error, + http_status_code=ex.http_status_code, + inner=ex, + ) from ex + raise return if data is None: From a9fcfc8b2f8ed3ae7069ac3d1cc7968a447fcc1f Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Tue, 9 Sep 2025 13:59:59 -0500 Subject: [PATCH 05/28] Error handling / tests --- .../clients/dataframe/_data_frame_client.py | 30 +- .../test_append_table_data_polymorphic.py | 317 ++++++++++++++++++ 2 files changed, 337 insertions(+), 10 deletions(-) create mode 100644 tests/integration/dataframe/test_append_table_data_polymorphic.py diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index f1642a8b..bfc3a0a6 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -364,17 +364,27 @@ def _generate_body() -> Iterable[memoryview]: try: self._append_table_data_arrow(id, _generate_body(), end_of_data or False) except core.ApiException as ex: - # TODO: Maybe we should also check the error code in addition to the response status code? if ex.http_status_code == 400: - raise core.ApiException( - ( - "Arrow ingestion request was rejected. The target DataFrame Service doesn't support Arrow streaming. " - "Install a DataFrame Service version with Arrow support or fall back to JSON ingestion." - ), - error=ex.error, - http_status_code=ex.http_status_code, - inner=ex, - ) from ex + wrap = True + try: + info = self.api_info() + # write_data attribute (camelCase writeData in JSON) denotes supported write version + write_op = getattr(info.operations, "write_data", None) + if write_op is not None and getattr(write_op, "version", 0) >= 2: + # Service claims Arrow-capable write version; re-raise original exception + wrap = False + except Exception: # pragma: no cover - fallback to wrapping + pass + if wrap: + raise core.ApiException( + ( + "Arrow ingestion request was rejected. The target DataFrame Service doesn't support Arrow streaming. " + "Install a DataFrame Service version with Arrow support or fall back to JSON ingestion." 
+ ), + error=ex.error, + http_status_code=ex.http_status_code, + inner=ex, + ) from ex raise return diff --git a/tests/integration/dataframe/test_append_table_data_polymorphic.py b/tests/integration/dataframe/test_append_table_data_polymorphic.py new file mode 100644 index 00000000..8e943fcb --- /dev/null +++ b/tests/integration/dataframe/test_append_table_data_polymorphic.py @@ -0,0 +1,317 @@ +import itertools +from typing import Iterable, List, Optional + +from nisystemlink.clients.dataframe.models._column import Column +from nisystemlink.clients.dataframe.models._column_type import ColumnType +from nisystemlink.clients.dataframe.models._create_table_request import CreateTableRequest +from nisystemlink.clients.dataframe.models._data_type import DataType +import pytest # type: ignore +import responses +from responses import matchers + +from nisystemlink.clients.core import ApiException +from nisystemlink.clients.dataframe import DataFrameClient +from nisystemlink.clients.dataframe.models import AppendTableDataRequest, DataFrame + +int_index_column = Column( + name="index", data_type=DataType.Int32, column_type=ColumnType.Index +) + +basic_table_model = CreateTableRequest(columns=[int_index_column]) + + +@pytest.fixture(scope="class") +def client(enterprise_config): + """Fixture to create a DataFrameClient instance.""" + return DataFrameClient(enterprise_config) + + +@pytest.fixture(scope="class") +def create_table(client: DataFrameClient): + """Fixture to return a factory that creates tables.""" + tables = [] + + def _create_table(table: Optional[CreateTableRequest] = None) -> str: + id = client.create_table(table or basic_table_model) + tables.append(id) + return id + + yield _create_table + + client.delete_tables(tables) + + +@pytest.fixture(scope="class") +def test_tables(create_table): + """Fixture to create a set of test tables.""" + ids = [] + for i in range(1, 4): + ids.append( + create_table( + CreateTableRequest( + columns=[ + Column( + name="time", + data_type=DataType.Timestamp, + column_type=ColumnType.Index, + properties={"cat": "meow"}, + ), + Column(name="value", data_type=DataType.Int32), + ], + name=f"Python API test table {i} (delete me)", + properties={"dog": "woof"}, + ) + ) + ) + return ids + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__append_request_success( + client: DataFrameClient, test_tables: List[str] +): + frame = DataFrame(data=[["1"], ["2"]]) + request = AppendTableDataRequest(frame=frame, end_of_data=True) + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[ + matchers.json_params_matcher( + {"frame": {"data": [["1"], ["2"]]}, "endOfData": True} + ) + ], + json={}, + ) + client.append_table_data(test_tables[0], request) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__append_request_with_end_of_data_argument_disallowed( + client: DataFrameClient, +): + request = AppendTableDataRequest(end_of_data=True) + with pytest.raises(ValueError, match="end_of_data must not be provided separately when passing an AppendTableDataRequest."): + client.append_table_data(test_tables[0], request, end_of_data=True) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__accepts_dataframe_model(client: DataFrameClient, test_tables: List[str]): + frame = DataFrame(data=[["1"], ["2"]]) + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + 
f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[ + matchers.json_params_matcher( + {"frame": {"data": [["1"], ["2"]]}, "endOfData": True} + ) + ], + json={}, + ) + assert ( + client.append_table_data(test_tables[0], frame, end_of_data=True) is None + ) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__flush_only_with_none(client: DataFrameClient): + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[matchers.json_params_matcher({"endOfData": True})], + json={}, + ) + client.append_table_data(test_tables[0], None, end_of_data=True) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__empty_iterator_requires_end_of_data(client: DataFrameClient): + # No end_of_data -> ValueError + with pytest.raises(ValueError, match="end_of_data must be provided when data iterator is empty."): + client.append_table_data(test_tables[0], []) + + # With end_of_data -> sends JSON flush only + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[matchers.json_params_matcher({"endOfData": True})], + json={}, + ) + client.append_table_data(test_tables[0], [], end_of_data=True) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__arrow_iterable_without_pyarrow_raises_runtime_error( + client: DataFrameClient, test_tables: List[str], monkeypatch +): + # Simulate pyarrow not installed by setting module-level pa to None + import nisystemlink.clients.dataframe._data_frame_client as df_module + + monkeypatch.setattr(df_module, "pa", None) + + with pytest.raises(RuntimeError, match="pyarrow is not installed. 
Install to stream RecordBatches."): + client.append_table_data(test_tables[0], [object()]) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises( + client: DataFrameClient, test_tables: List[str] +): + pytest.importorskip("pyarrow") + # Pass a list whose first element is not a RecordBatch -> should raise ValueError + with pytest.raises(ValueError, match="Iterable provided to data must yield pyarrow.RecordBatch objects."): + client.append_table_data(test_tables[0], [1, 2, 3]) + + +def _arrow_record_batches() -> Iterable["pa.RecordBatch"]: # type: ignore[name-defined] + import pyarrow as pa # type: ignore + + batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) + return [batch] + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__arrow_ingestion_400_wrapped_error(client: DataFrameClient): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + # Mock api_info indicating writeData version 1 (no Arrow support) + rsps.add( + responses.GET, + f"{client.session.base_url}", + json={ + "operations": { + "createTables": {"available": True, "version": 1}, + "deleteTables": {"available": True, "version": 1}, + "modifyMetadata": {"available": True, "version": 1}, + "listTables": {"available": True, "version": 1}, + "readData": {"available": True, "version": 1}, + "writeData": {"available": True, "version": 1}, + } + }, + ) + def _callback(request): # type: ignore + return (400, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + with pytest.raises(ApiException, match="Arrow ingestion request was rejected"): + client.append_table_data(test_tables[0], _arrow_record_batches()) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__arrow_ingestion_400_not_wrapped_when_version2( + client: DataFrameClient, test_tables: List[str] +): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + # Mock api_info indicating writeData version 2 (Arrow support); expect original 400 (no friendly message) + rsps.add( + responses.GET, + f"{client.session.base_url}", + json={ + "operations": { + "createTables": {"available": True, "version": 1}, + "deleteTables": {"available": True, "version": 1}, + "modifyMetadata": {"available": True, "version": 1}, + "listTables": {"available": True, "version": 1}, + "readData": {"available": True, "version": 1}, + "writeData": {"available": True, "version": 2}, + } + }, + ) + + def _callback(request): # type: ignore + return (400, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + with pytest.raises(ApiException) as excinfo: + client.append_table_data(test_tables[0], _arrow_record_batches()) + # Ensure friendly message NOT used + assert "doesn't support Arrow streaming" not in str(excinfo.value) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__arrow_ingestion_400_api_info_failure_wrapped( + client: DataFrameClient, test_tables: List[str] +): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + # Simulate api_info failure (500) so fallback wrapping should occur + rsps.add( + responses.GET, + 
f"{client.session.base_url}", + status=500, + json={"error": "server error"}, + ) + + def _callback(request): # type: ignore + return (400, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + with pytest.raises(ApiException, match="Arrow ingestion request was rejected"): + client.append_table_data(test_tables[0], _arrow_record_batches()) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__arrow_ingestion_success(client: DataFrameClient): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + # Use callback to assert content-type header (can't easily JSON match binary stream) + def _callback(request): # type: ignore + assert ( + request.headers.get("Content-Type") + == "application/vnd.apache.arrow.stream" + ) + return (200, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + # Provide iterator of record batches + client.append_table_data(test_tables[0], _arrow_record_batches()) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__unsupported_type_raises(client: DataFrameClient): + with pytest.raises(ValueError, match="Unsupported type"): + client.append_table_data(test_tables[0], 123) # type: ignore[arg-type] From 8c89e58a12e37d300a03205a6e44e25aa0ebfa12 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Tue, 9 Sep 2025 14:21:25 -0500 Subject: [PATCH 06/28] Update test_append_table_data_polymorphic.py Iterating on tests --- .../test_append_table_data_polymorphic.py | 83 +++++++++++++++++-- 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/tests/integration/dataframe/test_append_table_data_polymorphic.py b/tests/integration/dataframe/test_append_table_data_polymorphic.py index 8e943fcb..94ba6710 100644 --- a/tests/integration/dataframe/test_append_table_data_polymorphic.py +++ b/tests/integration/dataframe/test_append_table_data_polymorphic.py @@ -90,7 +90,7 @@ def test__append_table_data__append_request_success( @pytest.mark.enterprise @pytest.mark.integration def test__append_table_data__append_request_with_end_of_data_argument_disallowed( - client: DataFrameClient, + client: DataFrameClient, test_tables: List[str] ): request = AppendTableDataRequest(end_of_data=True) with pytest.raises(ValueError, match="end_of_data must not be provided separately when passing an AppendTableDataRequest."): @@ -116,10 +116,33 @@ def test__append_table_data__accepts_dataframe_model(client: DataFrameClient, te client.append_table_data(test_tables[0], frame, end_of_data=True) is None ) +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__dataframe_without_end_of_data_success( + client: DataFrameClient, test_tables: List[str] +): + frame = DataFrame(data=[["10"], ["11"]]) + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + json={}, + ) + client.append_table_data(test_tables[0], frame) # no end_of_data + @pytest.mark.enterprise @pytest.mark.integration -def test__append_table_data__flush_only_with_none(client: DataFrameClient): +def test__append_table_data__none_without_end_of_data_raises( + client: DataFrameClient, test_tables: List[str] +): + with pytest.raises(ValueError, match="end_of_data must be 
provided when data is None"): + client.append_table_data(test_tables[0], None) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__flush_only_with_none(client: DataFrameClient, test_tables: List[str]): with responses.RequestsMock() as rsps: rsps.add( responses.POST, @@ -132,7 +155,7 @@ def test__append_table_data__flush_only_with_none(client: DataFrameClient): @pytest.mark.enterprise @pytest.mark.integration -def test__append_table_data__empty_iterator_requires_end_of_data(client: DataFrameClient): +def test__append_table_data__empty_iterator_requires_end_of_data(client: DataFrameClient, test_tables: List[str]): # No end_of_data -> ValueError with pytest.raises(ValueError, match="end_of_data must be provided when data iterator is empty."): client.append_table_data(test_tables[0], []) @@ -182,7 +205,7 @@ def _arrow_record_batches() -> Iterable["pa.RecordBatch"]: # type: ignore[name- @pytest.mark.enterprise @pytest.mark.integration -def test__append_table_data__arrow_ingestion_400_wrapped_error(client: DataFrameClient): +def test__append_table_data__arrow_ingestion_400_wrapped_error(client: DataFrameClient, test_tables: List[str]): pa = pytest.importorskip("pyarrow") # noqa: F841 with responses.RequestsMock() as rsps: @@ -287,7 +310,55 @@ def _callback(request): # type: ignore @pytest.mark.enterprise @pytest.mark.integration -def test__append_table_data__arrow_ingestion_success(client: DataFrameClient): +def test__append_table_data__arrow_ingestion_non_400_error_passthrough( + client: DataFrameClient, test_tables: List[str] +): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + def _callback(request): # type: ignore + return (415, {}, "") # Unsupported Media Type + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + with pytest.raises(ApiException) as excinfo: + client.append_table_data(test_tables[0], _arrow_record_batches()) + assert "doesn't support Arrow streaming" not in str(excinfo.value) + assert "415" in str(excinfo.value) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__arrow_ingestion_with_end_of_data_query_param( + client: DataFrameClient, test_tables: List[str] +): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + def _callback(request): # type: ignore + url = request.url.lower() + # Allow either endOfData or end_of_data just in case + assert ("endofdata=true" in url) or ("end_of_data=true" in url) + return (200, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + client.append_table_data(test_tables[0], _arrow_record_batches(), end_of_data=True) + + +@pytest.mark.enterprise +@pytest.mark.integration +def test__append_table_data__arrow_ingestion_success(client: DataFrameClient, test_tables: List[str]): pa = pytest.importorskip("pyarrow") # noqa: F841 with responses.RequestsMock() as rsps: @@ -312,6 +383,6 @@ def _callback(request): # type: ignore @pytest.mark.enterprise @pytest.mark.integration -def test__append_table_data__unsupported_type_raises(client: DataFrameClient): +def test__append_table_data__unsupported_type_raises(client: DataFrameClient, test_tables: List[str]): with pytest.raises(ValueError, match="Unsupported type"): 
client.append_table_data(test_tables[0], 123) # type: ignore[arg-type] From 0ec37821c005057d8f89847309986171c3801529 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Tue, 9 Sep 2025 15:11:43 -0500 Subject: [PATCH 07/28] cleanup --- .../clients/dataframe/_data_frame_client.py | 3 +- .../test_append_table_data_polymorphic.py | 388 ------------------ tests/integration/dataframe/test_dataframe.py | 274 ++++++++++++- 3 files changed, 274 insertions(+), 391 deletions(-) delete mode 100644 tests/integration/dataframe/test_append_table_data_polymorphic.py diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index bfc3a0a6..13021e35 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -368,12 +368,11 @@ def _generate_body() -> Iterable[memoryview]: wrap = True try: info = self.api_info() - # write_data attribute (camelCase writeData in JSON) denotes supported write version write_op = getattr(info.operations, "write_data", None) if write_op is not None and getattr(write_op, "version", 0) >= 2: # Service claims Arrow-capable write version; re-raise original exception wrap = False - except Exception: # pragma: no cover - fallback to wrapping + except Exception: pass if wrap: raise core.ApiException( diff --git a/tests/integration/dataframe/test_append_table_data_polymorphic.py b/tests/integration/dataframe/test_append_table_data_polymorphic.py deleted file mode 100644 index 94ba6710..00000000 --- a/tests/integration/dataframe/test_append_table_data_polymorphic.py +++ /dev/null @@ -1,388 +0,0 @@ -import itertools -from typing import Iterable, List, Optional - -from nisystemlink.clients.dataframe.models._column import Column -from nisystemlink.clients.dataframe.models._column_type import ColumnType -from nisystemlink.clients.dataframe.models._create_table_request import CreateTableRequest -from nisystemlink.clients.dataframe.models._data_type import DataType -import pytest # type: ignore -import responses -from responses import matchers - -from nisystemlink.clients.core import ApiException -from nisystemlink.clients.dataframe import DataFrameClient -from nisystemlink.clients.dataframe.models import AppendTableDataRequest, DataFrame - -int_index_column = Column( - name="index", data_type=DataType.Int32, column_type=ColumnType.Index -) - -basic_table_model = CreateTableRequest(columns=[int_index_column]) - - -@pytest.fixture(scope="class") -def client(enterprise_config): - """Fixture to create a DataFrameClient instance.""" - return DataFrameClient(enterprise_config) - - -@pytest.fixture(scope="class") -def create_table(client: DataFrameClient): - """Fixture to return a factory that creates tables.""" - tables = [] - - def _create_table(table: Optional[CreateTableRequest] = None) -> str: - id = client.create_table(table or basic_table_model) - tables.append(id) - return id - - yield _create_table - - client.delete_tables(tables) - - -@pytest.fixture(scope="class") -def test_tables(create_table): - """Fixture to create a set of test tables.""" - ids = [] - for i in range(1, 4): - ids.append( - create_table( - CreateTableRequest( - columns=[ - Column( - name="time", - data_type=DataType.Timestamp, - column_type=ColumnType.Index, - properties={"cat": "meow"}, - ), - Column(name="value", data_type=DataType.Int32), - ], - name=f"Python API test table {i} (delete me)", - properties={"dog": "woof"}, - ) - ) - ) - return ids - - -@pytest.mark.enterprise 
-@pytest.mark.integration -def test__append_table_data__append_request_success( - client: DataFrameClient, test_tables: List[str] -): - frame = DataFrame(data=[["1"], ["2"]]) - request = AppendTableDataRequest(frame=frame, end_of_data=True) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[ - matchers.json_params_matcher( - {"frame": {"data": [["1"], ["2"]]}, "endOfData": True} - ) - ], - json={}, - ) - client.append_table_data(test_tables[0], request) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__append_request_with_end_of_data_argument_disallowed( - client: DataFrameClient, test_tables: List[str] -): - request = AppendTableDataRequest(end_of_data=True) - with pytest.raises(ValueError, match="end_of_data must not be provided separately when passing an AppendTableDataRequest."): - client.append_table_data(test_tables[0], request, end_of_data=True) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__accepts_dataframe_model(client: DataFrameClient, test_tables: List[str]): - frame = DataFrame(data=[["1"], ["2"]]) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[ - matchers.json_params_matcher( - {"frame": {"data": [["1"], ["2"]]}, "endOfData": True} - ) - ], - json={}, - ) - assert ( - client.append_table_data(test_tables[0], frame, end_of_data=True) is None - ) - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__dataframe_without_end_of_data_success( - client: DataFrameClient, test_tables: List[str] -): - frame = DataFrame(data=[["10"], ["11"]]) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - json={}, - ) - client.append_table_data(test_tables[0], frame) # no end_of_data - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__none_without_end_of_data_raises( - client: DataFrameClient, test_tables: List[str] -): - with pytest.raises(ValueError, match="end_of_data must be provided when data is None"): - client.append_table_data(test_tables[0], None) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__flush_only_with_none(client: DataFrameClient, test_tables: List[str]): - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[matchers.json_params_matcher({"endOfData": True})], - json={}, - ) - client.append_table_data(test_tables[0], None, end_of_data=True) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__empty_iterator_requires_end_of_data(client: DataFrameClient, test_tables: List[str]): - # No end_of_data -> ValueError - with pytest.raises(ValueError, match="end_of_data must be provided when data iterator is empty."): - client.append_table_data(test_tables[0], []) - - # With end_of_data -> sends JSON flush only - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[matchers.json_params_matcher({"endOfData": True})], - json={}, - ) - client.append_table_data(test_tables[0], [], end_of_data=True) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__arrow_iterable_without_pyarrow_raises_runtime_error( - client: DataFrameClient, 
test_tables: List[str], monkeypatch -): - # Simulate pyarrow not installed by setting module-level pa to None - import nisystemlink.clients.dataframe._data_frame_client as df_module - - monkeypatch.setattr(df_module, "pa", None) - - with pytest.raises(RuntimeError, match="pyarrow is not installed. Install to stream RecordBatches."): - client.append_table_data(test_tables[0], [object()]) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises( - client: DataFrameClient, test_tables: List[str] -): - pytest.importorskip("pyarrow") - # Pass a list whose first element is not a RecordBatch -> should raise ValueError - with pytest.raises(ValueError, match="Iterable provided to data must yield pyarrow.RecordBatch objects."): - client.append_table_data(test_tables[0], [1, 2, 3]) - - -def _arrow_record_batches() -> Iterable["pa.RecordBatch"]: # type: ignore[name-defined] - import pyarrow as pa # type: ignore - - batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) - return [batch] - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__arrow_ingestion_400_wrapped_error(client: DataFrameClient, test_tables: List[str]): - pa = pytest.importorskip("pyarrow") # noqa: F841 - - with responses.RequestsMock() as rsps: - # Mock api_info indicating writeData version 1 (no Arrow support) - rsps.add( - responses.GET, - f"{client.session.base_url}", - json={ - "operations": { - "createTables": {"available": True, "version": 1}, - "deleteTables": {"available": True, "version": 1}, - "modifyMetadata": {"available": True, "version": 1}, - "listTables": {"available": True, "version": 1}, - "readData": {"available": True, "version": 1}, - "writeData": {"available": True, "version": 1}, - } - }, - ) - def _callback(request): # type: ignore - return (400, {}, "") - - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", - ) - - with pytest.raises(ApiException, match="Arrow ingestion request was rejected"): - client.append_table_data(test_tables[0], _arrow_record_batches()) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__arrow_ingestion_400_not_wrapped_when_version2( - client: DataFrameClient, test_tables: List[str] -): - pa = pytest.importorskip("pyarrow") # noqa: F841 - - with responses.RequestsMock() as rsps: - # Mock api_info indicating writeData version 2 (Arrow support); expect original 400 (no friendly message) - rsps.add( - responses.GET, - f"{client.session.base_url}", - json={ - "operations": { - "createTables": {"available": True, "version": 1}, - "deleteTables": {"available": True, "version": 1}, - "modifyMetadata": {"available": True, "version": 1}, - "listTables": {"available": True, "version": 1}, - "readData": {"available": True, "version": 1}, - "writeData": {"available": True, "version": 2}, - } - }, - ) - - def _callback(request): # type: ignore - return (400, {}, "") - - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", - ) - - with pytest.raises(ApiException) as excinfo: - client.append_table_data(test_tables[0], _arrow_record_batches()) - # Ensure friendly message NOT used - assert "doesn't support Arrow streaming" not in str(excinfo.value) - - -@pytest.mark.enterprise -@pytest.mark.integration -def 
test__append_table_data__arrow_ingestion_400_api_info_failure_wrapped( - client: DataFrameClient, test_tables: List[str] -): - pa = pytest.importorskip("pyarrow") # noqa: F841 - - with responses.RequestsMock() as rsps: - # Simulate api_info failure (500) so fallback wrapping should occur - rsps.add( - responses.GET, - f"{client.session.base_url}", - status=500, - json={"error": "server error"}, - ) - - def _callback(request): # type: ignore - return (400, {}, "") - - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", - ) - - with pytest.raises(ApiException, match="Arrow ingestion request was rejected"): - client.append_table_data(test_tables[0], _arrow_record_batches()) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__arrow_ingestion_non_400_error_passthrough( - client: DataFrameClient, test_tables: List[str] -): - pa = pytest.importorskip("pyarrow") # noqa: F841 - - with responses.RequestsMock() as rsps: - def _callback(request): # type: ignore - return (415, {}, "") # Unsupported Media Type - - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", - ) - - with pytest.raises(ApiException) as excinfo: - client.append_table_data(test_tables[0], _arrow_record_batches()) - assert "doesn't support Arrow streaming" not in str(excinfo.value) - assert "415" in str(excinfo.value) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__arrow_ingestion_with_end_of_data_query_param( - client: DataFrameClient, test_tables: List[str] -): - pa = pytest.importorskip("pyarrow") # noqa: F841 - - with responses.RequestsMock() as rsps: - def _callback(request): # type: ignore - url = request.url.lower() - # Allow either endOfData or end_of_data just in case - assert ("endofdata=true" in url) or ("end_of_data=true" in url) - return (200, {}, "") - - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", - ) - - client.append_table_data(test_tables[0], _arrow_record_batches(), end_of_data=True) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__arrow_ingestion_success(client: DataFrameClient, test_tables: List[str]): - pa = pytest.importorskip("pyarrow") # noqa: F841 - - with responses.RequestsMock() as rsps: - # Use callback to assert content-type header (can't easily JSON match binary stream) - def _callback(request): # type: ignore - assert ( - request.headers.get("Content-Type") - == "application/vnd.apache.arrow.stream" - ) - return (200, {}, "") - - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", - ) - - # Provide iterator of record batches - client.append_table_data(test_tables[0], _arrow_record_batches()) - - -@pytest.mark.enterprise -@pytest.mark.integration -def test__append_table_data__unsupported_type_raises(client: DataFrameClient, test_tables: List[str]): - with pytest.raises(ValueError, match="Unsupported type"): - client.append_table_data(test_tables[0], 123) # type: ignore[arg-type] diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index 25c1cd93..3ef8f575 100644 
--- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from datetime import datetime, timezone -from typing import List, Optional +from typing import List, Optional, Iterable import pytest # type: ignore import responses @@ -84,6 +84,13 @@ def test_tables(create_table): return ids +def _arrow_record_batches() -> Iterable["pa.RecordBatch"]: # type: ignore[name-defined] + import pyarrow as pa # type: ignore + + batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) + return [batch] + + @pytest.mark.enterprise @pytest.mark.integration class TestDataFrame: @@ -615,3 +622,268 @@ def test__export_table_data__works(self, client: DataFrameClient, create_table): response.read() == b'"col1","col2","col3"\r\n1,2.5,6.5\r\n2,1.5,5.5\r\n3,2.5,7.5' ) + + def test__append_table_data__append_request_success( + self, client: DataFrameClient, test_tables: List[str] + ): + frame = DataFrame(data=[["1"], ["2"]]) + request = AppendTableDataRequest(frame=frame, end_of_data=True) + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[ + matchers.json_params_matcher( + {"frame": {"data": [["1"], ["2"]]}, "endOfData": True} + ) + ], + json={}, + ) + client.append_table_data(test_tables[0], request) + + def test__append_table_data__append_request_with_end_of_data_argument_disallowed( + self, client: DataFrameClient, test_tables: List[str] + ): + request = AppendTableDataRequest(end_of_data=True) + with pytest.raises(ValueError, match="end_of_data must not be provided separately when passing an AppendTableDataRequest."): + client.append_table_data(test_tables[0], request, end_of_data=True) + + def test__append_table_data__accepts_dataframe_model(self, client: DataFrameClient, test_tables: List[str]): + frame = DataFrame(data=[["1"], ["2"]]) + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[ + matchers.json_params_matcher( + {"frame": {"data": [["1"], ["2"]]}, "endOfData": True} + ) + ], + json={}, + ) + assert ( + client.append_table_data(test_tables[0], frame, end_of_data=True) is None + ) + + def test__append_table_data__dataframe_without_end_of_data_success( + self, client: DataFrameClient, test_tables: List[str] + ): + frame = DataFrame(data=[["10"], ["11"]]) + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + json={}, + ) + client.append_table_data(test_tables[0], frame) # no end_of_data + + def test__append_table_data__none_without_end_of_data_raises( + self, client: DataFrameClient, test_tables: List[str] + ): + with pytest.raises(ValueError, match="end_of_data must be provided when data is None"): + client.append_table_data(test_tables[0], None) + + def test__append_table_data__flush_only_with_none(self, client: DataFrameClient, test_tables: List[str]): + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[matchers.json_params_matcher({"endOfData": True})], + json={}, + ) + client.append_table_data(test_tables[0], None, end_of_data=True) + + def test__append_table_data__empty_iterator_requires_end_of_data(self, client: DataFrameClient, test_tables: List[str]): + # No end_of_data -> ValueError + with pytest.raises(ValueError, match="end_of_data must be provided when data iterator is 
empty."): + client.append_table_data(test_tables[0], []) + + # With end_of_data -> sends JSON flush only + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[matchers.json_params_matcher({"endOfData": True})], + json={}, + ) + client.append_table_data(test_tables[0], [], end_of_data=True) + + def test__append_table_data__arrow_iterable_without_pyarrow_raises_runtime_error( + self, client: DataFrameClient, test_tables: List[str], monkeypatch + ): + # Simulate pyarrow not installed by setting module-level pa to None + import nisystemlink.clients.dataframe._data_frame_client as df_module + + monkeypatch.setattr(df_module, "pa", None) + + with pytest.raises(RuntimeError, match="pyarrow is not installed. Install to stream RecordBatches."): + client.append_table_data(test_tables[0], [object()]) + + def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises( + self, client: DataFrameClient, test_tables: List[str] + ): + pytest.importorskip("pyarrow") + # Pass a list whose first element is not a RecordBatch -> should raise ValueError + with pytest.raises(ValueError, match="Iterable provided to data must yield pyarrow.RecordBatch objects."): + client.append_table_data(test_tables[0], [1, 2, 3]) + + def test__append_table_data__arrow_ingestion_400_wrapped_error(self, client: DataFrameClient, test_tables: List[str]): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + # Mock api_info indicating writeData version 1 (no Arrow support) + rsps.add( + responses.GET, + f"{client.session.base_url}", + json={ + "operations": { + "createTables": {"available": True, "version": 1}, + "deleteTables": {"available": True, "version": 1}, + "modifyMetadata": {"available": True, "version": 1}, + "listTables": {"available": True, "version": 1}, + "readData": {"available": True, "version": 1}, + "writeData": {"available": True, "version": 1}, + } + }, + ) + def _callback(request): # type: ignore + return (400, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + with pytest.raises(ApiException, match="Arrow ingestion request was rejected"): + client.append_table_data(test_tables[0], _arrow_record_batches()) + + def test__append_table_data__arrow_ingestion_400_not_wrapped_when_version2( + self, client: DataFrameClient, test_tables: List[str] + ): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + # Mock api_info indicating writeData version 2 (Arrow support); expect original 400 (no friendly message) + rsps.add( + responses.GET, + f"{client.session.base_url}", + json={ + "operations": { + "createTables": {"available": True, "version": 1}, + "deleteTables": {"available": True, "version": 1}, + "modifyMetadata": {"available": True, "version": 1}, + "listTables": {"available": True, "version": 1}, + "readData": {"available": True, "version": 1}, + "writeData": {"available": True, "version": 2}, + } + }, + ) + + def _callback(request): # type: ignore + return (400, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + with pytest.raises(ApiException) as excinfo: + client.append_table_data(test_tables[0], _arrow_record_batches()) + # Ensure friendly message 
NOT used + assert "doesn't support Arrow streaming" not in str(excinfo.value) + + def test__append_table_data__arrow_ingestion_400_api_info_failure_wrapped( + self, client: DataFrameClient, test_tables: List[str] + ): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + # Simulate api_info failure (500) so fallback wrapping should occur + rsps.add( + responses.GET, + f"{client.session.base_url}", + status=500, + json={"error": "server error"}, + ) + + def _callback(request): # type: ignore + return (400, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + with pytest.raises(ApiException, match="Arrow ingestion request was rejected"): + client.append_table_data(test_tables[0], _arrow_record_batches()) + + def test__append_table_data__arrow_ingestion_non_400_error_passthrough( + self, client: DataFrameClient, test_tables: List[str] + ): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + def _callback(request): # type: ignore + return (415, {}, "") # Unsupported Media Type + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + with pytest.raises(ApiException) as excinfo: + client.append_table_data(test_tables[0], _arrow_record_batches()) + assert "The target DataFrame Service doesn't support Arrow streaming." not in str(excinfo.value) + assert "415" in str(excinfo.value) + + def test__append_table_data__arrow_ingestion_with_end_of_data_query_param( + self, client: DataFrameClient, test_tables: List[str] + ): + pa = pytest.importorskip("pyarrow") # noqa: F841 + + with responses.RequestsMock() as rsps: + def _callback(request): # type: ignore + url = request.url.lower() + # Allow either endOfData or end_of_data just in case + assert ("endofdata=true" in url) or ("end_of_data=true" in url) + return (200, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + client.append_table_data(test_tables[0], _arrow_record_batches(), end_of_data=True) + + def test__append_table_data__arrow_ingestion_success(self, client: DataFrameClient, test_tables: List[str]): + with responses.RequestsMock() as rsps: + # Use callback to assert content-type header (can't easily JSON match binary stream) + def _callback(request): # type: ignore + assert ( + request.headers.get("Content-Type") + == "application/vnd.apache.arrow.stream" + ) + return (200, {}, "") + + rsps.add_callback( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + callback=_callback, + content_type="application/vnd.apache.arrow.stream", + ) + + client.append_table_data(test_tables[0], _arrow_record_batches()) + + def test__append_table_data__unsupported_type_raises(self, client: DataFrameClient, test_tables: List[str]): + with pytest.raises(ValueError, match="Unsupported type"): + client.append_table_data(test_tables[0], 123) From 6feea37362ed2e5ebbb103f72b86077be8c265e4 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Tue, 9 Sep 2025 16:59:22 -0500 Subject: [PATCH 08/28] Iterating on tests --- .../clients/dataframe/_data_frame_client.py | 18 +- tests/integration/dataframe/test_dataframe.py | 157 +++++++++--------- 2 files changed, 
88 insertions(+), 87 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 13021e35..86d975f7 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -266,11 +266,11 @@ def _append_table_data_json( @post( "tables/{id}/data", - args=[Path, Body, Query], + args=[Path, Body, Query("endOfData")], content_type="application/vnd.apache.arrow.stream", ) def _append_table_data_arrow( - self, id: str, data: Iterable[bytes], end_of_data: bool + self, id: str, data: Iterable[bytes], end_of_data: Optional[bool] = None ) -> None: ... @@ -281,7 +281,7 @@ def append_table_data( Union[ models.AppendTableDataRequest, models.DataFrame, - Iterable["pa.RecordBatch"], # type: ignore[name-defined] + Iterable["pa.RecordBatch"], # type: ignore[name-defined] ] ], *, @@ -313,7 +313,8 @@ def append_table_data( if isinstance(data, models.DataFrame): self._append_table_data_json( - id, models.AppendTableDataRequest(frame=data, end_of_data=end_of_data) + id, + models.AppendTableDataRequest(frame=data, end_of_data=end_of_data) ) return @@ -327,7 +328,8 @@ def append_table_data( "end_of_data must be provided when data iterator is empty." ) self._append_table_data_json( - id, models.AppendTableDataRequest(end_of_data=end_of_data) + id, + models.AppendTableDataRequest(end_of_data=end_of_data), ) return @@ -362,7 +364,11 @@ def _generate_body() -> Iterable[memoryview]: yield slice try: - self._append_table_data_arrow(id, _generate_body(), end_of_data or False) + self._append_table_data_arrow( + id, + _generate_body(), + ("true" if (end_of_data or False) else "false"), + ) except core.ApiException as ex: if ex.http_status_code == 400: wrap = True diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index 3ef8f575..ce721999 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -648,6 +648,20 @@ def test__append_table_data__append_request_with_end_of_data_argument_disallowed with pytest.raises(ValueError, match="end_of_data must not be provided separately when passing an AppendTableDataRequest."): client.append_table_data(test_tables[0], request, end_of_data=True) + def test__append_table_data__append_request_without_end_of_data_success( + self, client: DataFrameClient, test_tables: List[str] + ): + frame = DataFrame(data=[["7"], ["8"]]) + request = AppendTableDataRequest(frame=frame) + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + f"{client.session.base_url}tables/{test_tables[0]}/data", + match=[matchers.json_params_matcher({"frame": {"data": [["7"], ["8"]]}})], + json={}, + ) + client.append_table_data(test_tables[0], request) + def test__append_table_data__accepts_dataframe_model(self, client: DataFrameClient, test_tables: List[str]): frame = DataFrame(data=[["1"], ["2"]]) with responses.RequestsMock() as rsps: @@ -655,9 +669,7 @@ def test__append_table_data__accepts_dataframe_model(self, client: DataFrameClie responses.POST, f"{client.session.base_url}tables/{test_tables[0]}/data", match=[ - matchers.json_params_matcher( - {"frame": {"data": [["1"], ["2"]]}, "endOfData": True} - ) + matchers.json_params_matcher({"frame": {"data": [["1"], ["2"]]}, "endOfData": True}), ], json={}, ) @@ -675,7 +687,7 @@ def test__append_table_data__dataframe_without_end_of_data_success( f"{client.session.base_url}tables/{test_tables[0]}/data", 
json={}, ) - client.append_table_data(test_tables[0], frame) # no end_of_data + client.append_table_data(test_tables[0], frame) def test__append_table_data__none_without_end_of_data_raises( self, client: DataFrameClient, test_tables: List[str] @@ -688,22 +700,24 @@ def test__append_table_data__flush_only_with_none(self, client: DataFrameClient, rsps.add( responses.POST, f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[matchers.json_params_matcher({"endOfData": True})], + match=[ + matchers.json_params_matcher({"endOfData": True}), + ], json={}, ) client.append_table_data(test_tables[0], None, end_of_data=True) def test__append_table_data__empty_iterator_requires_end_of_data(self, client: DataFrameClient, test_tables: List[str]): - # No end_of_data -> ValueError with pytest.raises(ValueError, match="end_of_data must be provided when data iterator is empty."): client.append_table_data(test_tables[0], []) - # With end_of_data -> sends JSON flush only with responses.RequestsMock() as rsps: rsps.add( responses.POST, f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[matchers.json_params_matcher({"endOfData": True})], + match=[ + matchers.json_params_matcher({"endOfData": True}), + ], json={}, ) client.append_table_data(test_tables[0], [], end_of_data=True) @@ -713,7 +727,6 @@ def test__append_table_data__arrow_iterable_without_pyarrow_raises_runtime_error ): # Simulate pyarrow not installed by setting module-level pa to None import nisystemlink.clients.dataframe._data_frame_client as df_module - monkeypatch.setattr(df_module, "pa", None) with pytest.raises(RuntimeError, match="pyarrow is not installed. Install to stream RecordBatches."): @@ -723,18 +736,34 @@ def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises self, client: DataFrameClient, test_tables: List[str] ): pytest.importorskip("pyarrow") - # Pass a list whose first element is not a RecordBatch -> should raise ValueError with pytest.raises(ValueError, match="Iterable provided to data must yield pyarrow.RecordBatch objects."): client.append_table_data(test_tables[0], [1, 2, 3]) - def test__append_table_data__arrow_ingestion_400_wrapped_error(self, client: DataFrameClient, test_tables: List[str]): - pa = pytest.importorskip("pyarrow") # noqa: F841 - - with responses.RequestsMock() as rsps: - # Mock api_info indicating writeData version 1 (no Arrow support) + @pytest.mark.parametrize( + "write_version,api_info_failure,expected_wrapped", + [ + (1, False, True), + (2, False, False), + (None, True, True), + ], + ) + def test__append_table_data__arrow_ingestion_400_variants( + self, + client: DataFrameClient, + test_tables: List[str], + write_version: int | None, + api_info_failure: bool, + expected_wrapped: bool, + ): + pa = pytest.importorskip("pyarrow") + def _mock_api_info(rsps: responses.RequestsMock, base_url: str, *, write_version: int | None = None, failure: bool = False) -> None: + if failure: + rsps.add(responses.GET, f"{base_url}", status=500, json={"error": "server error"}) + return + version = 1 if write_version is None else write_version rsps.add( responses.GET, - f"{client.session.base_url}", + f"{base_url}", json={ "operations": { "createTables": {"available": True, "version": 1}, @@ -742,46 +771,19 @@ def test__append_table_data__arrow_ingestion_400_wrapped_error(self, client: Dat "modifyMetadata": {"available": True, "version": 1}, "listTables": {"available": True, "version": 1}, "readData": {"available": True, "version": 1}, - "writeData": {"available": True, 
"version": 1}, + "writeData": {"available": True, "version": version}, } }, ) - def _callback(request): # type: ignore - return (400, {}, "") - - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", - ) - - with pytest.raises(ApiException, match="Arrow ingestion request was rejected"): - client.append_table_data(test_tables[0], _arrow_record_batches()) - - def test__append_table_data__arrow_ingestion_400_not_wrapped_when_version2( - self, client: DataFrameClient, test_tables: List[str] - ): - pa = pytest.importorskip("pyarrow") # noqa: F841 - with responses.RequestsMock() as rsps: - # Mock api_info indicating writeData version 2 (Arrow support); expect original 400 (no friendly message) - rsps.add( - responses.GET, - f"{client.session.base_url}", - json={ - "operations": { - "createTables": {"available": True, "version": 1}, - "deleteTables": {"available": True, "version": 1}, - "modifyMetadata": {"available": True, "version": 1}, - "listTables": {"available": True, "version": 1}, - "readData": {"available": True, "version": 1}, - "writeData": {"available": True, "version": 2}, - } - }, + _mock_api_info( + rsps, + client.session.base_url, + write_version=write_version, + failure=api_info_failure, ) - def _callback(request): # type: ignore + def _callback(request): return (400, {}, "") rsps.add_callback( @@ -793,25 +795,20 @@ def _callback(request): # type: ignore with pytest.raises(ApiException) as excinfo: client.append_table_data(test_tables[0], _arrow_record_batches()) - # Ensure friendly message NOT used - assert "doesn't support Arrow streaming" not in str(excinfo.value) + msg = str(excinfo.value) + if expected_wrapped: + assert "Arrow ingestion request was rejected" in msg + else: + assert "Arrow ingestion request was rejected" not in msg - def test__append_table_data__arrow_ingestion_400_api_info_failure_wrapped( + def test__append_table_data__arrow_ingestion_non_400_error_passthrough( self, client: DataFrameClient, test_tables: List[str] ): - pa = pytest.importorskip("pyarrow") # noqa: F841 + pa = pytest.importorskip("pyarrow") with responses.RequestsMock() as rsps: - # Simulate api_info failure (500) so fallback wrapping should occur - rsps.add( - responses.GET, - f"{client.session.base_url}", - status=500, - json={"error": "server error"}, - ) - - def _callback(request): # type: ignore - return (400, {}, "") + def _callback(request): + return (401, {}, "") rsps.add_callback( responses.POST, @@ -820,17 +817,20 @@ def _callback(request): # type: ignore content_type="application/vnd.apache.arrow.stream", ) - with pytest.raises(ApiException, match="Arrow ingestion request was rejected"): + with pytest.raises(ApiException) as excinfo: client.append_table_data(test_tables[0], _arrow_record_batches()) + assert "Arrow ingestion request was rejected" not in str(excinfo.value) + assert "401" in str(excinfo.value) - def test__append_table_data__arrow_ingestion_non_400_error_passthrough( + def test__append_table_data__arrow_ingestion_with_end_of_data_query_param( self, client: DataFrameClient, test_tables: List[str] ): - pa = pytest.importorskip("pyarrow") # noqa: F841 + pa = pytest.importorskip("pyarrow") with responses.RequestsMock() as rsps: - def _callback(request): # type: ignore - return (415, {}, "") # Unsupported Media Type + def _callback(request): + assert "endOfData=true" in request.url + return (200, {}, "") rsps.add_callback( responses.POST, @@ -839,21 +839,16 
@@ def _callback(request): # type: ignore content_type="application/vnd.apache.arrow.stream", ) - with pytest.raises(ApiException) as excinfo: - client.append_table_data(test_tables[0], _arrow_record_batches()) - assert "The target DataFrame Service doesn't support Arrow streaming." not in str(excinfo.value) - assert "415" in str(excinfo.value) + client.append_table_data(test_tables[0], _arrow_record_batches(), end_of_data=True) - def test__append_table_data__arrow_ingestion_with_end_of_data_query_param( + def test__append_table_data__arrow_ingestion_with_end_of_data_query_param_false( self, client: DataFrameClient, test_tables: List[str] ): - pa = pytest.importorskip("pyarrow") # noqa: F841 + pa = pytest.importorskip("pyarrow") with responses.RequestsMock() as rsps: - def _callback(request): # type: ignore - url = request.url.lower() - # Allow either endOfData or end_of_data just in case - assert ("endofdata=true" in url) or ("end_of_data=true" in url) + def _callback(request): + assert "endOfData=false" in request.url return (200, {}, "") rsps.add_callback( @@ -863,12 +858,12 @@ def _callback(request): # type: ignore content_type="application/vnd.apache.arrow.stream", ) - client.append_table_data(test_tables[0], _arrow_record_batches(), end_of_data=True) + client.append_table_data(test_tables[0], _arrow_record_batches(), end_of_data=False) def test__append_table_data__arrow_ingestion_success(self, client: DataFrameClient, test_tables: List[str]): with responses.RequestsMock() as rsps: # Use callback to assert content-type header (can't easily JSON match binary stream) - def _callback(request): # type: ignore + def _callback(request): assert ( request.headers.get("Content-Type") == "application/vnd.apache.arrow.stream" From 68a73e2ecd954a326fc5df7a88c54316e2971b7c Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 10 Sep 2025 10:22:57 -0500 Subject: [PATCH 09/28] cleanup --- nisystemlink/clients/dataframe/_data_frame_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 86d975f7..6ea4426c 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -373,8 +373,7 @@ def _generate_body() -> Iterable[memoryview]: if ex.http_status_code == 400: wrap = True try: - info = self.api_info() - write_op = getattr(info.operations, "write_data", None) + write_op = getattr(self.api_info().operations, "write_data", None) if write_op is not None and getattr(write_op, "version", 0) >= 2: # Service claims Arrow-capable write version; re-raise original exception wrap = False From 01d2efd16abe0ea06ad08f516d81de47ab9c4d16 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Thu, 11 Sep 2025 15:00:26 -0500 Subject: [PATCH 10/28] Tests talk with real DFS --- .../clients/dataframe/_data_frame_client.py | 46 ++- tests/integration/dataframe/test_dataframe.py | 343 ++++++------------ 2 files changed, 144 insertions(+), 245 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 6ea4426c..4673a851 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -312,10 +312,14 @@ def append_table_data( return if isinstance(data, models.DataFrame): - self._append_table_data_json( - id, - models.AppendTableDataRequest(frame=data, end_of_data=end_of_data) - ) + # 
Only include end_of_data in the request if explicitly provided to avoid sending null + if end_of_data is None: + request_model = models.AppendTableDataRequest(frame=data) + else: + request_model = models.AppendTableDataRequest( + frame=data, end_of_data=end_of_data + ) + self._append_table_data_json(id, request_model) return if isinstance(data, Iterable): @@ -344,30 +348,36 @@ def append_table_data( ) def _generate_body() -> Iterable[memoryview]: - data_iter = (b for b in (first_batch, *iterator)) - first = True + data_iter = iter(data) + try: + batch = next(data_iter) + except StopIteration: + return with BytesIO() as buf: - writer = None - for batch in data_iter: - if first: - options = pa.ipc.IpcWriteOptions(compression="zstd") - writer = pa.ipc.new_stream(buf, batch.schema, options=options) - first = False + options = pa.ipc.IpcWriteOptions(compression="zstd") + writer = pa.ipc.new_stream(buf, batch.schema, options=options) + + while True: writer.write_batch(batch) with buf.getbuffer() as view, view[0 : buf.tell()] as slice: yield slice buf.seek(0) - if writer is not None: - writer.close() - with buf.getbuffer() as view: - with view[0 : buf.tell()] as slice: - yield slice + try: + batch = next(data_iter) + except StopIteration: + break + + writer.close() + with buf.getbuffer() as view: + with view[0 : buf.tell()] as slice: + yield slice try: + # Only include the endOfData query when caller specified it. self._append_table_data_arrow( id, _generate_body(), - ("true" if (end_of_data or False) else "false"), + (end_of_data if end_of_data is not None else None), ) except core.ApiException as ex: if ex.http_status_code == 400: diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index ce721999..c14f7ba5 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -84,16 +84,13 @@ def test_tables(create_table): return ids -def _arrow_record_batches() -> Iterable["pa.RecordBatch"]: # type: ignore[name-defined] - import pyarrow as pa # type: ignore - - batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) - return [batch] - - @pytest.mark.enterprise @pytest.mark.integration class TestDataFrame: + def _new_single_int_table(self, create_table, column_name: str = "a") -> str: + return create_table( + CreateTableRequest(columns=[Column(name=column_name, data_type=DataType.Int64, column_type=ColumnType.Index)]) + ) def test__api_info__returns(self, client): response = client.api_info() @@ -623,262 +620,154 @@ def test__export_table_data__works(self, client: DataFrameClient, create_table): == b'"col1","col2","col3"\r\n1,2.5,6.5\r\n2,1.5,5.5\r\n3,2.5,7.5' ) - def test__append_table_data__append_request_success( - self, client: DataFrameClient, test_tables: List[str] - ): - frame = DataFrame(data=[["1"], ["2"]]) - request = AppendTableDataRequest(frame=frame, end_of_data=True) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[ - matchers.json_params_matcher( - {"frame": {"data": [["1"], ["2"]]}, "endOfData": True} - ) - ], - json={}, - ) - client.append_table_data(test_tables[0], request) + def test__append_table_data__append_request_success(self, client: DataFrameClient, create_table): + table_id = self._new_single_int_table(create_table) + frame = DataFrame(columns=["a"], data=[["1"], ["2"]]) + client.append_table_data(table_id, AppendTableDataRequest(frame=frame, end_of_data=True)) - def 
test__append_table_data__append_request_with_end_of_data_argument_disallowed( - self, client: DataFrameClient, test_tables: List[str] - ): + def test__append_table_data__append_request_with_end_of_data_argument_disallowed(self, client: DataFrameClient, create_table): request = AppendTableDataRequest(end_of_data=True) with pytest.raises(ValueError, match="end_of_data must not be provided separately when passing an AppendTableDataRequest."): - client.append_table_data(test_tables[0], request, end_of_data=True) + client.append_table_data(self._new_single_int_table(create_table), request, end_of_data=True) - def test__append_table_data__append_request_without_end_of_data_success( - self, client: DataFrameClient, test_tables: List[str] - ): - frame = DataFrame(data=[["7"], ["8"]]) - request = AppendTableDataRequest(frame=frame) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[matchers.json_params_matcher({"frame": {"data": [["7"], ["8"]]}})], - json={}, - ) - client.append_table_data(test_tables[0], request) + def test__append_table_data__append_request_without_end_of_data_success(self, client: DataFrameClient, create_table): + table_id = self._new_single_int_table(create_table) + frame = DataFrame(columns=["a"], data=[["7"], ["8"]]) + client.append_table_data(table_id, AppendTableDataRequest(frame=frame)) - def test__append_table_data__accepts_dataframe_model(self, client: DataFrameClient, test_tables: List[str]): - frame = DataFrame(data=[["1"], ["2"]]) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[ - matchers.json_params_matcher({"frame": {"data": [["1"], ["2"]]}, "endOfData": True}), - ], - json={}, - ) - assert ( - client.append_table_data(test_tables[0], frame, end_of_data=True) is None - ) + def test__append_table_data__accepts_dataframe_model(self, client: DataFrameClient, create_table): + table_id = self._new_single_int_table(create_table) + frame = DataFrame(columns=["a"], data=[["1"], ["2"]]) + client.append_table_data(table_id, frame, end_of_data=True) - def test__append_table_data__dataframe_without_end_of_data_success( - self, client: DataFrameClient, test_tables: List[str] - ): - frame = DataFrame(data=[["10"], ["11"]]) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - json={}, - ) - client.append_table_data(test_tables[0], frame) + def test__append_table_data__dataframe_without_end_of_data_success(self, client: DataFrameClient, create_table): + table_id = self._new_single_int_table(create_table) + frame = DataFrame(columns=["a"], data=[["10"], ["11"]]) + client.append_table_data(table_id, frame) - def test__append_table_data__none_without_end_of_data_raises( - self, client: DataFrameClient, test_tables: List[str] - ): + def test__append_table_data__none_without_end_of_data_raises(self, client: DataFrameClient, create_table): + table_id = create_table(basic_table_model) with pytest.raises(ValueError, match="end_of_data must be provided when data is None"): - client.append_table_data(test_tables[0], None) + client.append_table_data(table_id, None) - def test__append_table_data__flush_only_with_none(self, client: DataFrameClient, test_tables: List[str]): - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[ - 
matchers.json_params_matcher({"endOfData": True}), - ], - json={}, - ) - client.append_table_data(test_tables[0], None, end_of_data=True) + def test__append_table_data__flush_only_with_none(self, client: DataFrameClient, create_table): + table_id = self._new_single_int_table(create_table) + frame = DataFrame(columns=["a"], data=[["1"]]) + client.append_table_data(table_id, frame) + client.append_table_data(table_id, None, end_of_data=True) + + def test__append_table_data__arrow_ingestion_success(self, client: DataFrameClient, create_table): + pa = pytest.importorskip("pyarrow") + table_id = self._new_single_int_table(create_table) + batch = pa.record_batch([pa.array([10, 11, 12])], names=["a"]) + client.append_table_data(table_id, [batch], end_of_data=True) + with pytest.raises(ApiException): + client.append_table_data(table_id, None, end_of_data=True) - def test__append_table_data__empty_iterator_requires_end_of_data(self, client: DataFrameClient, test_tables: List[str]): + def test__append_table_data__arrow_ingestion_with_end_of_data_query_param_false(self, client: DataFrameClient, create_table): + pa = pytest.importorskip("pyarrow") + table_id = self._new_single_int_table(create_table) + batch1 = pa.record_batch([pa.array([4, 5, 6])], names=["a"]) + client.append_table_data(table_id, [batch1], end_of_data=False) + batch2 = pa.record_batch([pa.array([7, 8])], names=["a"]) + client.append_table_data(table_id, [batch2], end_of_data=True) + + def test__append_table_data__empty_iterator_requires_end_of_data(self, client: DataFrameClient, create_table): + table_id = create_table(basic_table_model) with pytest.raises(ValueError, match="end_of_data must be provided when data iterator is empty."): - client.append_table_data(test_tables[0], []) + client.append_table_data(table_id, []) + client.append_table_data(table_id, [], end_of_data=True) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - match=[ - matchers.json_params_matcher({"endOfData": True}), - ], - json={}, - ) - client.append_table_data(test_tables[0], [], end_of_data=True) + def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises(self, client: DataFrameClient, create_table): + pytest.importorskip("pyarrow") + table_id = create_table(basic_table_model) + with pytest.raises(ValueError, match="Iterable provided to data must yield pyarrow.RecordBatch objects."): + client.append_table_data(table_id, [1, 2, 3]) - def test__append_table_data__arrow_iterable_without_pyarrow_raises_runtime_error( - self, client: DataFrameClient, test_tables: List[str], monkeypatch - ): - # Simulate pyarrow not installed by setting module-level pa to None + def test__append_table_data__arrow_iterable_without_pyarrow_raises_runtime_error(self, client: DataFrameClient, create_table, monkeypatch): import nisystemlink.clients.dataframe._data_frame_client as df_module monkeypatch.setattr(df_module, "pa", None) - + table_id = create_table(basic_table_model) with pytest.raises(RuntimeError, match="pyarrow is not installed. 
Install to stream RecordBatches."): - client.append_table_data(test_tables[0], [object()]) + client.append_table_data(table_id, [object()]) - def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises( - self, client: DataFrameClient, test_tables: List[str] - ): - pytest.importorskip("pyarrow") - with pytest.raises(ValueError, match="Iterable provided to data must yield pyarrow.RecordBatch objects."): - client.append_table_data(test_tables[0], [1, 2, 3]) - - @pytest.mark.parametrize( - "write_version,api_info_failure,expected_wrapped", - [ - (1, False, True), - (2, False, False), - (None, True, True), - ], - ) - def test__append_table_data__arrow_ingestion_400_variants( - self, - client: DataFrameClient, - test_tables: List[str], - write_version: int | None, - api_info_failure: bool, - expected_wrapped: bool, - ): + def test__append_table_data__arrow_ingestion_400_unsupported(self, client: DataFrameClient): pa = pytest.importorskip("pyarrow") - def _mock_api_info(rsps: responses.RequestsMock, base_url: str, *, write_version: int | None = None, failure: bool = False) -> None: - if failure: - rsps.add(responses.GET, f"{base_url}", status=500, json={"error": "server error"}) - return - version = 1 if write_version is None else write_version + table_id = "mock_table_id" + bad_batch = pa.record_batch([pa.array([1, 2, 3])], names=["b"]) + api_info_json = { + "operations": { + "create_tables": {"available": True, "version": 1}, + "delete_tables": {"available": True, "version": 1}, + "modify_metadata": {"available": True, "version": 1}, + "list_tables": {"available": True, "version": 1}, + "read_data": {"available": True, "version": 3}, + "write_data": {"available": True, "version": 1}, + } + } + + with responses.RequestsMock() as rsps: rsps.add( responses.GET, - f"{base_url}", - json={ - "operations": { - "createTables": {"available": True, "version": 1}, - "deleteTables": {"available": True, "version": 1}, - "modifyMetadata": {"available": True, "version": 1}, - "listTables": {"available": True, "version": 1}, - "readData": {"available": True, "version": 1}, - "writeData": {"available": True, "version": version}, - } - }, - ) - with responses.RequestsMock() as rsps: - _mock_api_info( - rsps, - client.session.base_url, - write_version=write_version, - failure=api_info_failure, + f"{client.session.base_url}", + json=api_info_json, ) - - def _callback(request): - return (400, {}, "") - - rsps.add_callback( + rsps.add( responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", + f"{client.session.base_url}tables/{table_id}/data", + status=400, ) - with pytest.raises(ApiException) as excinfo: - client.append_table_data(test_tables[0], _arrow_record_batches()) - msg = str(excinfo.value) - if expected_wrapped: - assert "Arrow ingestion request was rejected" in msg - else: - assert "Arrow ingestion request was rejected" not in msg - - def test__append_table_data__arrow_ingestion_non_400_error_passthrough( - self, client: DataFrameClient, test_tables: List[str] - ): - pa = pytest.importorskip("pyarrow") + client.append_table_data(table_id, [bad_batch], end_of_data=True) - with responses.RequestsMock() as rsps: - def _callback(request): - return (401, {}, "") + assert "Arrow ingestion request was rejected" in str(excinfo.value) - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - 
content_type="application/vnd.apache.arrow.stream", - ) - - with pytest.raises(ApiException) as excinfo: - client.append_table_data(test_tables[0], _arrow_record_batches()) - assert "Arrow ingestion request was rejected" not in str(excinfo.value) - assert "401" in str(excinfo.value) - - def test__append_table_data__arrow_ingestion_with_end_of_data_query_param( - self, client: DataFrameClient, test_tables: List[str] - ): + def test__append_table_data__arrow_ingestion_400_supported_passthrough(self, client: DataFrameClient): pa = pytest.importorskip("pyarrow") + table_id = "mock_table_id" + bad_batch = pa.record_batch([pa.array([1, 2, 3])], names=["b"]) + api_info_json = { + "operations": { + "create_tables": {"available": True, "version": 1}, + "delete_tables": {"available": True, "version": 1}, + "modify_metadata": {"available": True, "version": 1}, + "list_tables": {"available": True, "version": 1}, + "read_data": {"available": True, "version": 3}, + "write_data": {"available": True, "version": 2}, + } + } with responses.RequestsMock() as rsps: - def _callback(request): - assert "endOfData=true" in request.url - return (200, {}, "") - - rsps.add_callback( - responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", + rsps.add( + responses.GET, + f"{client.session.base_url}", + json=api_info_json, ) - - client.append_table_data(test_tables[0], _arrow_record_batches(), end_of_data=True) - - def test__append_table_data__arrow_ingestion_with_end_of_data_query_param_false( - self, client: DataFrameClient, test_tables: List[str] - ): - pa = pytest.importorskip("pyarrow") - - with responses.RequestsMock() as rsps: - def _callback(request): - assert "endOfData=false" in request.url - return (200, {}, "") - - rsps.add_callback( + rsps.add( responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", + f"{client.session.base_url}tables/{table_id}/data", + status=400, ) + with pytest.raises(ApiException) as excinfo: + client.append_table_data(table_id, [bad_batch], end_of_data=True) - client.append_table_data(test_tables[0], _arrow_record_batches(), end_of_data=False) + assert "Arrow ingestion request was rejected" not in str(excinfo.value) - def test__append_table_data__arrow_ingestion_success(self, client: DataFrameClient, test_tables: List[str]): + def test__append_table_data__arrow_ingestion_non_400_passthrough(self, client: DataFrameClient): + pa = pytest.importorskip("pyarrow") + table_id = "mock_table_id" + batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) with responses.RequestsMock() as rsps: - # Use callback to assert content-type header (can't easily JSON match binary stream) - def _callback(request): - assert ( - request.headers.get("Content-Type") - == "application/vnd.apache.arrow.stream" - ) - return (200, {}, "") - - rsps.add_callback( + rsps.add( responses.POST, - f"{client.session.base_url}tables/{test_tables[0]}/data", - callback=_callback, - content_type="application/vnd.apache.arrow.stream", + f"{client.session.base_url}tables/{table_id}/data", + status=409 ) + with pytest.raises(ApiException) as excinfo: + client.append_table_data(table_id, [batch], end_of_data=True) + assert "Arrow ingestion request was rejected" not in str(excinfo.value) - client.append_table_data(test_tables[0], _arrow_record_batches()) - - def test__append_table_data__unsupported_type_raises(self, client: 
DataFrameClient, test_tables: List[str]): + def test__append_table_data__unsupported_type_raises(self, client: DataFrameClient, create_table): + table_id = create_table(basic_table_model) with pytest.raises(ValueError, match="Unsupported type"): - client.append_table_data(test_tables[0], 123) + client.append_table_data(table_id, 123) From db95df72e5085012bfca8e167421fd5afa7eaeb9 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Thu, 11 Sep 2025 15:04:30 -0500 Subject: [PATCH 11/28] Cleanup --- nisystemlink/clients/dataframe/_data_frame_client.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 4673a851..7b583215 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -312,7 +312,6 @@ def append_table_data( return if isinstance(data, models.DataFrame): - # Only include end_of_data in the request if explicitly provided to avoid sending null if end_of_data is None: request_model = models.AppendTableDataRequest(frame=data) else: @@ -373,7 +372,6 @@ def _generate_body() -> Iterable[memoryview]: yield slice try: - # Only include the endOfData query when caller specified it. self._append_table_data_arrow( id, _generate_body(), From 221dedeb56241ea518341eeefabc8c8db193c8ed Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Thu, 11 Sep 2025 15:11:21 -0500 Subject: [PATCH 12/28] Cleanup --- nisystemlink/clients/dataframe/_data_frame_client.py | 11 ++++------- tests/integration/dataframe/test_dataframe.py | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 7b583215..702725d3 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -1,12 +1,9 @@ """Implementation of DataFrameClient.""" -try: # Optional pyarrow dependency - import pyarrow as pa # type: ignore -except Exception: # pragma: no cover - pyarrow not installed - pa = None # type: ignore +import pyarrow as pa # type: ignore from collections.abc import Iterable from io import BytesIO -from typing import Any, List, Optional, Union +from typing import List, Optional, Union from nisystemlink.clients import core from nisystemlink.clients.core._uplink._base_client import BaseClient @@ -19,7 +16,7 @@ ) from nisystemlink.clients.core.helpers import IteratorFileLike from requests.models import Response -from uplink import Body, Field, Path, Query, headers, retry +from uplink import Body, Field, Path, Query, retry from . import models @@ -452,7 +449,7 @@ def query_decimated_data( """ ... 
- def _iter_content_filelike_wrapper(response: Response) -> IteratorFileLike: # type: ignore + def _iter_content_filelike_wrapper(response: Response) -> IteratorFileLike: return IteratorFileLike(response.iter_content(chunk_size=4096)) @response_handler(_iter_content_filelike_wrapper) diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index c14f7ba5..18f17594 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from datetime import datetime, timezone -from typing import List, Optional, Iterable +from typing import List, Optional import pytest # type: ignore import responses From 5eb05c9a9b759eb203a49cffa8e649df69b3f192 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Thu, 11 Sep 2025 15:28:54 -0500 Subject: [PATCH 13/28] Fix formatting and tests for 3.10 (hopefully) --- .../clients/dataframe/_data_frame_client.py | 59 ++++------ tests/integration/dataframe/test_dataframe.py | 110 ++++++++++++++---- 2 files changed, 107 insertions(+), 62 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 702725d3..8aafa043 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -3,7 +3,7 @@ import pyarrow as pa # type: ignore from collections.abc import Iterable from io import BytesIO -from typing import List, Optional, Union +from typing import Any, List, Optional, Union from nisystemlink.clients import core from nisystemlink.clients.core._uplink._base_client import BaseClient @@ -252,14 +252,10 @@ def get_table_data( """ ... - @post( - "tables/{id}/data", - args=[Path, Body] - ) + @post("tables/{id}/data", args=[Path, Body]) def _append_table_data_json( self, id: str, data: models.AppendTableDataRequest - ) -> None: - ... + ) -> None: ... @post( "tables/{id}/data", @@ -267,9 +263,8 @@ def _append_table_data_json( content_type="application/vnd.apache.arrow.stream", ) def _append_table_data_arrow( - self, id: str, data: Iterable[bytes], end_of_data: Optional[bool] = None - ) -> None: - ... + self, id: str, data: bytes, end_of_data: Optional[bool] = None + ) -> None: ... def append_table_data( self, @@ -278,7 +273,7 @@ def append_table_data( Union[ models.AppendTableDataRequest, models.DataFrame, - Iterable["pa.RecordBatch"], # type: ignore[name-defined] + Iterable["pa.RecordBatch"], # type: ignore[name-defined] ] ], *, @@ -343,44 +338,34 @@ def append_table_data( "Iterable provided to data must yield pyarrow.RecordBatch objects." 
) - def _generate_body() -> Iterable[memoryview]: - data_iter = iter(data) - try: - batch = next(data_iter) - except StopIteration: - return + def _build_body() -> bytes: with BytesIO() as buf: options = pa.ipc.IpcWriteOptions(compression="zstd") - writer = pa.ipc.new_stream(buf, batch.schema, options=options) - - while True: - writer.write_batch(batch) - with buf.getbuffer() as view, view[0 : buf.tell()] as slice: - yield slice - buf.seek(0) - try: - batch = next(data_iter) - except StopIteration: - break - - writer.close() - with buf.getbuffer() as view: - with view[0 : buf.tell()] as slice: - yield slice + with pa.ipc.new_stream( + buf, first_batch.schema, options=options + ) as writer: + writer.write_batch(first_batch) + for batch in iterator: + writer.write_batch(batch) + return buf.getvalue() try: self._append_table_data_arrow( id, - _generate_body(), + _build_body(), (end_of_data if end_of_data is not None else None), ) except core.ApiException as ex: if ex.http_status_code == 400: wrap = True try: - write_op = getattr(self.api_info().operations, "write_data", None) - if write_op is not None and getattr(write_op, "version", 0) >= 2: - # Service claims Arrow-capable write version; re-raise original exception + write_op = getattr( + self.api_info().operations, "write_data", None + ) + if ( + write_op is not None + and getattr(write_op, "version", 0) >= 2 + ): wrap = False except Exception: pass diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index 18f17594..e900e9b5 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -89,8 +89,17 @@ def test_tables(create_table): class TestDataFrame: def _new_single_int_table(self, create_table, column_name: str = "a") -> str: return create_table( - CreateTableRequest(columns=[Column(name=column_name, data_type=DataType.Int64, column_type=ColumnType.Index)]) + CreateTableRequest( + columns=[ + Column( + name=column_name, + data_type=DataType.Int64, + column_type=ColumnType.Index, + ) + ] + ) ) + def test__api_info__returns(self, client): response = client.api_info() @@ -620,43 +629,68 @@ def test__export_table_data__works(self, client: DataFrameClient, create_table): == b'"col1","col2","col3"\r\n1,2.5,6.5\r\n2,1.5,5.5\r\n3,2.5,7.5' ) - def test__append_table_data__append_request_success(self, client: DataFrameClient, create_table): + def test__append_table_data__append_request_success( + self, client: DataFrameClient, create_table + ): table_id = self._new_single_int_table(create_table) frame = DataFrame(columns=["a"], data=[["1"], ["2"]]) - client.append_table_data(table_id, AppendTableDataRequest(frame=frame, end_of_data=True)) + client.append_table_data( + table_id, AppendTableDataRequest(frame=frame, end_of_data=True) + ) - def test__append_table_data__append_request_with_end_of_data_argument_disallowed(self, client: DataFrameClient, create_table): + def test__append_table_data__append_request_with_end_of_data_argument_disallowed( + self, client: DataFrameClient, create_table + ): request = AppendTableDataRequest(end_of_data=True) - with pytest.raises(ValueError, match="end_of_data must not be provided separately when passing an AppendTableDataRequest."): - client.append_table_data(self._new_single_int_table(create_table), request, end_of_data=True) + with pytest.raises( + ValueError, + match="end_of_data must not be provided separately when passing an AppendTableDataRequest.", + ): + client.append_table_data( + 
self._new_single_int_table(create_table), request, end_of_data=True + ) - def test__append_table_data__append_request_without_end_of_data_success(self, client: DataFrameClient, create_table): + def test__append_table_data__append_request_without_end_of_data_success( + self, client: DataFrameClient, create_table + ): table_id = self._new_single_int_table(create_table) frame = DataFrame(columns=["a"], data=[["7"], ["8"]]) client.append_table_data(table_id, AppendTableDataRequest(frame=frame)) - def test__append_table_data__accepts_dataframe_model(self, client: DataFrameClient, create_table): + def test__append_table_data__accepts_dataframe_model( + self, client: DataFrameClient, create_table + ): table_id = self._new_single_int_table(create_table) frame = DataFrame(columns=["a"], data=[["1"], ["2"]]) client.append_table_data(table_id, frame, end_of_data=True) - def test__append_table_data__dataframe_without_end_of_data_success(self, client: DataFrameClient, create_table): + def test__append_table_data__dataframe_without_end_of_data_success( + self, client: DataFrameClient, create_table + ): table_id = self._new_single_int_table(create_table) frame = DataFrame(columns=["a"], data=[["10"], ["11"]]) client.append_table_data(table_id, frame) - def test__append_table_data__none_without_end_of_data_raises(self, client: DataFrameClient, create_table): + def test__append_table_data__none_without_end_of_data_raises( + self, client: DataFrameClient, create_table + ): table_id = create_table(basic_table_model) - with pytest.raises(ValueError, match="end_of_data must be provided when data is None"): + with pytest.raises( + ValueError, match="end_of_data must be provided when data is None" + ): client.append_table_data(table_id, None) - def test__append_table_data__flush_only_with_none(self, client: DataFrameClient, create_table): + def test__append_table_data__flush_only_with_none( + self, client: DataFrameClient, create_table + ): table_id = self._new_single_int_table(create_table) frame = DataFrame(columns=["a"], data=[["1"]]) client.append_table_data(table_id, frame) client.append_table_data(table_id, None, end_of_data=True) - def test__append_table_data__arrow_ingestion_success(self, client: DataFrameClient, create_table): + def test__append_table_data__arrow_ingestion_success( + self, client: DataFrameClient, create_table + ): pa = pytest.importorskip("pyarrow") table_id = self._new_single_int_table(create_table) batch = pa.record_batch([pa.array([10, 11, 12])], names=["a"]) @@ -664,7 +698,9 @@ def test__append_table_data__arrow_ingestion_success(self, client: DataFrameClie with pytest.raises(ApiException): client.append_table_data(table_id, None, end_of_data=True) - def test__append_table_data__arrow_ingestion_with_end_of_data_query_param_false(self, client: DataFrameClient, create_table): + def test__append_table_data__arrow_ingestion_with_end_of_data_query_param_false( + self, client: DataFrameClient, create_table + ): pa = pytest.importorskip("pyarrow") table_id = self._new_single_int_table(create_table) batch1 = pa.record_batch([pa.array([4, 5, 6])], names=["a"]) @@ -672,26 +708,44 @@ def test__append_table_data__arrow_ingestion_with_end_of_data_query_param_false( batch2 = pa.record_batch([pa.array([7, 8])], names=["a"]) client.append_table_data(table_id, [batch2], end_of_data=True) - def test__append_table_data__empty_iterator_requires_end_of_data(self, client: DataFrameClient, create_table): + def test__append_table_data__empty_iterator_requires_end_of_data( + self, client: DataFrameClient, 
create_table + ): table_id = create_table(basic_table_model) - with pytest.raises(ValueError, match="end_of_data must be provided when data iterator is empty."): + with pytest.raises( + ValueError, + match="end_of_data must be provided when data iterator is empty.", + ): client.append_table_data(table_id, []) client.append_table_data(table_id, [], end_of_data=True) - def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises(self, client: DataFrameClient, create_table): + def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises( + self, client: DataFrameClient, create_table + ): pytest.importorskip("pyarrow") table_id = create_table(basic_table_model) - with pytest.raises(ValueError, match="Iterable provided to data must yield pyarrow.RecordBatch objects."): + with pytest.raises( + ValueError, + match="Iterable provided to data must yield pyarrow.RecordBatch objects.", + ): client.append_table_data(table_id, [1, 2, 3]) - def test__append_table_data__arrow_iterable_without_pyarrow_raises_runtime_error(self, client: DataFrameClient, create_table, monkeypatch): + def test__append_table_data__arrow_iterable_without_pyarrow_raises_runtime_error( + self, client: DataFrameClient, create_table, monkeypatch + ): import nisystemlink.clients.dataframe._data_frame_client as df_module + monkeypatch.setattr(df_module, "pa", None) table_id = create_table(basic_table_model) - with pytest.raises(RuntimeError, match="pyarrow is not installed. Install to stream RecordBatches."): + with pytest.raises( + RuntimeError, + match="pyarrow is not installed. Install to stream RecordBatches.", + ): client.append_table_data(table_id, [object()]) - def test__append_table_data__arrow_ingestion_400_unsupported(self, client: DataFrameClient): + def test__append_table_data__arrow_ingestion_400_unsupported( + self, client: DataFrameClient + ): pa = pytest.importorskip("pyarrow") table_id = "mock_table_id" bad_batch = pa.record_batch([pa.array([1, 2, 3])], names=["b"]) @@ -722,7 +776,9 @@ def test__append_table_data__arrow_ingestion_400_unsupported(self, client: DataF assert "Arrow ingestion request was rejected" in str(excinfo.value) - def test__append_table_data__arrow_ingestion_400_supported_passthrough(self, client: DataFrameClient): + def test__append_table_data__arrow_ingestion_400_supported_passthrough( + self, client: DataFrameClient + ): pa = pytest.importorskip("pyarrow") table_id = "mock_table_id" bad_batch = pa.record_batch([pa.array([1, 2, 3])], names=["b"]) @@ -753,7 +809,9 @@ def test__append_table_data__arrow_ingestion_400_supported_passthrough(self, cli assert "Arrow ingestion request was rejected" not in str(excinfo.value) - def test__append_table_data__arrow_ingestion_non_400_passthrough(self, client: DataFrameClient): + def test__append_table_data__arrow_ingestion_non_400_passthrough( + self, client: DataFrameClient + ): pa = pytest.importorskip("pyarrow") table_id = "mock_table_id" batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) @@ -761,13 +819,15 @@ def test__append_table_data__arrow_ingestion_non_400_passthrough(self, client: D rsps.add( responses.POST, f"{client.session.base_url}tables/{table_id}/data", - status=409 + status=409, ) with pytest.raises(ApiException) as excinfo: client.append_table_data(table_id, [batch], end_of_data=True) assert "Arrow ingestion request was rejected" not in str(excinfo.value) - def test__append_table_data__unsupported_type_raises(self, client: DataFrameClient, create_table): + def 
test__append_table_data__unsupported_type_raises( + self, client: DataFrameClient, create_table + ): table_id = create_table(basic_table_model) with pytest.raises(ValueError, match="Unsupported type"): client.append_table_data(table_id, 123) From 579b9d3877befab70e474ae853930f4b4483aa95 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Thu, 11 Sep 2025 15:44:07 -0500 Subject: [PATCH 14/28] Update _data_frame_client.py Make linter happy --- .../clients/dataframe/_data_frame_client.py | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 8aafa043..47c7a3c8 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -1,10 +1,10 @@ """Implementation of DataFrameClient.""" -import pyarrow as pa # type: ignore from collections.abc import Iterable from io import BytesIO -from typing import Any, List, Optional, Union +from typing import List, Optional, Union +import pyarrow as pa # type: ignore from nisystemlink.clients import core from nisystemlink.clients.core._uplink._base_client import BaseClient from nisystemlink.clients.core._uplink._methods import ( @@ -255,7 +255,9 @@ def get_table_data( @post("tables/{id}/data", args=[Path, Body]) def _append_table_data_json( self, id: str, data: models.AppendTableDataRequest - ) -> None: ... + ) -> None: + """Internal uplink-implemented JSON append call.""" + ... @post( "tables/{id}/data", @@ -264,7 +266,9 @@ def _append_table_data_json( ) def _append_table_data_arrow( self, id: str, data: bytes, end_of_data: Optional[bool] = None - ) -> None: ... + ) -> None: + """Internal uplink-implemented Arrow (binary) append call.""" + ... def append_table_data( self, @@ -283,18 +287,24 @@ def append_table_data( Args: id: Unique ID of a data table. - data: The data to append in one of the following forms: - - AppendTableDataRequest: Sent as-is via JSON. ``end_of_data`` must be ``None``. - - DataFrame (service model): Wrapped into an AppendTableDataRequest (``end_of_data`` optional) and sent as JSON. - - Iterable[pyarrow.RecordBatch]: Streamed as Arrow IPC. ``end_of_data`` (if provided) is sent as a query parameter. If the iterator yields no batches, it is treated like ``None`` and requires ``end_of_data``. - - None: ``end_of_data`` must be provided; sends JSON containing only the ``endOfData`` flag. - end_of_data: Whether the table should expect any additional rows to be appended in future requests. Required when ``data`` is ``None`` or an empty RecordBatch iterator; must be omitted when supplying an ``AppendTableDataRequest`` (put it inside that model instead). + data: The data to append. Supported forms: + * AppendTableDataRequest: Sent as-is via JSON; ``end_of_data`` must be ``None``. + * DataFrame (service model): Wrapped into an AppendTableDataRequest (``end_of_data`` + optional) and sent as JSON. + * Iterable[pyarrow.RecordBatch]: Streamed as Arrow IPC. ``end_of_data`` (if provided) + is sent as a query parameter. If the iterator yields no batches, it is treated as + ``None`` and requires ``end_of_data``. + * None: ``end_of_data`` must be provided; sends JSON containing only the + ``endOfData`` flag. + end_of_data: Whether additional rows may be appended in future requests. 
Required when + ``data`` is ``None`` or the RecordBatch iterator is empty; must be omitted when + passing an ``AppendTableDataRequest`` (include it inside that model instead). Raises: ValueError: If parameter constraints are violated. - ApiException: If unable to communicate with the DataFrame Service or an invalid argument is provided. + ApiException: If unable to communicate with the DataFrame Service or an + invalid argument is provided. """ - if isinstance(data, models.AppendTableDataRequest): if end_of_data is not None: raise ValueError( @@ -372,8 +382,10 @@ def _build_body() -> bytes: if wrap: raise core.ApiException( ( - "Arrow ingestion request was rejected. The target DataFrame Service doesn't support Arrow streaming. " - "Install a DataFrame Service version with Arrow support or fall back to JSON ingestion." + "Arrow ingestion request was rejected. The target " + "DataFrame Service doesn't support Arrow streaming. " + "Install a DataFrame Service version with Arrow support " + "or fall back to JSON ingestion." ), error=ex.error, http_status_code=ex.http_status_code, From 05605dd5aef63c4ea3037a1cb9753e0d45f2272f Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Thu, 11 Sep 2025 15:55:55 -0500 Subject: [PATCH 15/28] Fix type checking for 3.10 --- tests/integration/dataframe/test_dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index e900e9b5..3eb05625 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from datetime import datetime, timezone -from typing import List, Optional +from typing import Any, cast, List, Optional import pytest # type: ignore import responses @@ -830,4 +830,5 @@ def test__append_table_data__unsupported_type_raises( ): table_id = create_table(basic_table_model) with pytest.raises(ValueError, match="Unsupported type"): - client.append_table_data(table_id, 123) + # cast to Any to satisfy type checker while still exercising runtime path + client.append_table_data(table_id, cast(Any, 123)) From 5f128b155f78d854f9d0b539cf0dee23e265538d Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 17 Sep 2025 16:36:02 -0500 Subject: [PATCH 16/28] Address review feedback --- README.rst | 10 ++++ docs/api_reference/dataframe.rst | 24 +++++++++ examples/dataframe/create_write_data.py | 52 ++++++++++++++++--- .../clients/dataframe/_data_frame_client.py | 52 ++++++++++++++----- poetry.lock | 8 ++- pyproject.toml | 5 +- tests/integration/dataframe/test_dataframe.py | 10 ++++ 7 files changed, 138 insertions(+), 23 deletions(-) diff --git a/README.rst b/README.rst index 7db163e9..102baeeb 100644 --- a/README.rst +++ b/README.rst @@ -34,6 +34,16 @@ To install **nisystemlink-clients**, use one of the following methods: $ python -m easy_install nisystemlink-clients +Optional Arrow (pyarrow) Support +-------------------------------- +The base install does not pull in ``pyarrow``. Install the optional extra if you +plan to use Arrow RecordBatch ingestion with ``DataFrameClient.append_table_data``:: + + $ python -m pip install "nisystemlink-clients[pyarrow]" + +Without the extra, Arrow-specific code paths will raise a ``RuntimeError`` when +attempting to append ``pyarrow.RecordBatch`` instances. + .. 
_usage_section: Usage diff --git a/docs/api_reference/dataframe.rst b/docs/api_reference/dataframe.rst index 27dd7ab9..ebdc9e06 100644 --- a/docs/api_reference/dataframe.rst +++ b/docs/api_reference/dataframe.rst @@ -25,3 +25,27 @@ nisystemlink.clients.dataframe .. automodule:: nisystemlink.clients.dataframe.models :members: :imported-members: + +Arrow / JSON Ingestion Notes +---------------------------- +``append_table_data`` accepts multiple data forms: + +* ``AppendTableDataRequest`` (JSON) +* ``DataFrame`` model (JSON) +* Single ``pyarrow.RecordBatch`` (Arrow IPC) +* Iterable of ``pyarrow.RecordBatch`` (Arrow IPC) +* ``None`` with ``end_of_data`` (flush only) + +Arrow support is optional and requires installing the ``pyarrow`` extra:: + + pip install "nisystemlink-clients[pyarrow]" + +If ``pyarrow`` is not installed and a RecordBatch (or iterable) is passed, a +``RuntimeError`` is raised. When Arrow is used, the batches are serialized into +an IPC stream and sent with content type +``application/vnd.apache.arrow.stream``; the ``end_of_data`` flag is sent as a +query parameter. JSON ingestion places ``endOfData`` in the request body. + +If the target SystemLink DataFrame Service does not yet support Arrow and +responds with HTTP 400, the client raises an explanatory ``ApiException`` +advising to upgrade or fall back to JSON ingestion. diff --git a/examples/dataframe/create_write_data.py b/examples/dataframe/create_write_data.py index a0ef0cd8..03d76964 100644 --- a/examples/dataframe/create_write_data.py +++ b/examples/dataframe/create_write_data.py @@ -1,6 +1,10 @@ import random from datetime import datetime +try: + import pyarrow as pa # type: ignore +except Exception: + pa = None from nisystemlink.clients.dataframe import DataFrameClient from nisystemlink.clients.dataframe.models import ( AppendTableDataRequest, @@ -25,12 +29,48 @@ ) ) -# Generate example data -frame = DataFrame( - data=[[i, random.random(), datetime.now().isoformat()] for i in range(100)] +# Append via explicit AppendTableDataRequest (JSON) +frame_request = DataFrame( + data=[[i, random.random(), datetime.now().isoformat()] for i in range(3)] ) +client.append_table_data(table_id, AppendTableDataRequest(frame=frame_request)) -# Write example data to table -client.append_table_data( - table_id, data=AppendTableDataRequest(frame=frame, endOfData=True) +# Append via DataFrame model directly (JSON) +frame_direct = DataFrame( + data=[[i + 3, random.random(), datetime.now().isoformat()] for i in range(3)] ) +client.append_table_data(table_id, frame_direct) + +if pa is not None: + # Append via single RecordBatch (Arrow) + batch_single = pa.record_batch( + [ + pa.array([6, 7, 8]), + pa.array([0.1, 0.2, 0.3]), + pa.array([datetime.now().isoformat()] * 3), + ], + names=["ix", "Float_Column", "Timestamp_Column"], + ) + client.append_table_data(table_id, batch_single) + + # Append via iterable of RecordBatches (Arrow) + batch_list = [ + pa.record_batch( + [ + pa.array([9, 10]), + pa.array([0.4, 0.5]), + pa.array([datetime.now().isoformat(), datetime.now().isoformat()]), + ], + names=["ix", "Float_Column", "Timestamp_Column"], + ) + ] + client.append_table_data(table_id, batch_list) + + # Flush only (no additional rows, mark end_of_data) + client.append_table_data(table_id, None, end_of_data=True) + + # Empty iterable (must supply end_of_data) + client.append_table_data(table_id, [], end_of_data=True) +else: + # If pyarrow not installed, flush via JSON path + client.append_table_data(table_id, None, end_of_data=True) diff --git 
a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 47c7a3c8..081cec8a 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -4,7 +4,10 @@ from io import BytesIO from typing import List, Optional, Union -import pyarrow as pa # type: ignore +try: + import pyarrow as pa # type: ignore +except Exception: + pa = None from nisystemlink.clients import core from nisystemlink.clients.core._uplink._base_client import BaseClient from nisystemlink.clients.core._uplink._methods import ( @@ -265,7 +268,7 @@ def _append_table_data_json( content_type="application/vnd.apache.arrow.stream", ) def _append_table_data_arrow( - self, id: str, data: bytes, end_of_data: Optional[bool] = None + self, id: str, data: Iterable[bytes], end_of_data: Optional[bool] = None ) -> None: """Internal uplink-implemented Arrow (binary) append call.""" ... @@ -277,6 +280,7 @@ def append_table_data( Union[ models.AppendTableDataRequest, models.DataFrame, + "pa.RecordBatch", # type: ignore[name-defined] Iterable["pa.RecordBatch"], # type: ignore[name-defined] ] ], @@ -291,9 +295,12 @@ def append_table_data( * AppendTableDataRequest: Sent as-is via JSON; ``end_of_data`` must be ``None``. * DataFrame (service model): Wrapped into an AppendTableDataRequest (``end_of_data`` optional) and sent as JSON. + * Single pyarrow.RecordBatch: Treated the same as an iterable containing one batch + and streamed as Arrow IPC. ``end_of_data`` (if provided) is sent as a query + parameter. * Iterable[pyarrow.RecordBatch]: Streamed as Arrow IPC. ``end_of_data`` (if provided) - is sent as a query parameter. If the iterator yields no batches, it is treated as - ``None`` and requires ``end_of_data``. + is sent as a query parameter. If the iterator yields no batches, it is treated + as ``None`` and requires ``end_of_data``. * None: ``end_of_data`` must be provided; sends JSON containing only the ``endOfData`` flag. end_of_data: Whether additional rows may be appended in future requests. Required when @@ -323,6 +330,9 @@ def append_table_data( self._append_table_data_json(id, request_model) return + if pa is not None and isinstance(data, pa.RecordBatch): + data = [data] + if isinstance(data, Iterable): iterator = iter(data) try: @@ -348,22 +358,36 @@ def append_table_data( "Iterable provided to data must yield pyarrow.RecordBatch objects." 
) - def _build_body() -> bytes: + def _generate_body() -> Iterable[memoryview]: + data_iter = iter(data) + try: + batch = next(data_iter) + except StopIteration: + return with BytesIO() as buf: options = pa.ipc.IpcWriteOptions(compression="zstd") - with pa.ipc.new_stream( - buf, first_batch.schema, options=options - ) as writer: - writer.write_batch(first_batch) - for batch in iterator: - writer.write_batch(batch) - return buf.getvalue() + writer = pa.ipc.new_stream(buf, batch.schema, options=options) + + while True: + writer.write_batch(batch) + with buf.getbuffer() as view, view[0 : buf.tell()] as slice: + yield slice + buf.seek(0) + try: + batch = next(data_iter) + except StopIteration: + break + + writer.close() + with buf.getbuffer() as view: + with view[0 : buf.tell()] as slice: + yield slice try: self._append_table_data_arrow( id, - _build_body(), - (end_of_data if end_of_data is not None else None), + _generate_body(), + end_of_data, ) except core.ApiException as ex: if ex.http_status_code == 400: diff --git a/poetry.lock b/poetry.lock index 9abcc72f..e6c4e8b5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -778,9 +778,10 @@ poetry-plugin = ["poetry (>=1.0,<2.0)"] name = "pyarrow" version = "21.0.0" description = "Python library for Apache Arrow" -optional = false +optional = true python-versions = ">=3.9" groups = ["main"] +markers = "extra == \"pyarrow\"" files = [ {file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e563271e2c5ff4d4a4cbeb2c83d5cf0d4938b891518e676025f7268c6fe5fe26"}, {file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fee33b0ca46f4c85443d6c450357101e47d53e6c3f008d658c27a2d020d44c79"}, @@ -1451,7 +1452,10 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[extras] +pyarrow = ["pyarrow"] + [metadata] lock-version = "2.1" python-versions = "^3.9" -content-hash = "92f274ea55d4fc180cd962e9c4da63df36472b7f967ab06ca38527468dbd7ea7" +content-hash = "6c8dde41cc84db4467014785da6deb10f5edbe3d646c9015dbc9638bdf0ead53" diff --git a/pyproject.toml b/pyproject.toml index edc2b189..7c18dc42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,10 @@ uplink = [ pydantic = "^2.11.3" pyyaml = "^6.0.1" pandas = "^2.1.0" -pyarrow = "^21.0.0" +pyarrow = { version = "^21.0.0", optional = true } + +[tool.poetry.extras] +pyarrow = ["pyarrow"] [tool.poetry.group.dev.dependencies] black = ">=22.10,<25.0" diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index 3eb05625..f3614c45 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -698,6 +698,16 @@ def test__append_table_data__arrow_ingestion_success( with pytest.raises(ApiException): client.append_table_data(table_id, None, end_of_data=True) + def test__append_table_data__single_recordbatch_success( + self, client: DataFrameClient, create_table + ): + pa = pytest.importorskip("pyarrow") + table_id = self._new_single_int_table(create_table) + batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) + client.append_table_data(table_id, batch, end_of_data=True) + with pytest.raises(ApiException): + client.append_table_data(table_id, None, end_of_data=True) + def test__append_table_data__arrow_ingestion_with_end_of_data_query_param_false( self, client: DataFrameClient, create_table ): From b25d53b9750729ab033c542e457f053aea7c18a9 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Mon, 22 Sep 2025 12:04:19 -0500 Subject: 
[PATCH 17/28] Address second round of review feedback --- examples/dataframe/create_write_data.py | 6 +++--- .../clients/dataframe/_data_frame_client.py | 13 ++++--------- tests/integration/dataframe/test_dataframe.py | 19 ++++++------------- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/examples/dataframe/create_write_data.py b/examples/dataframe/create_write_data.py index 03d76964..c842dd79 100644 --- a/examples/dataframe/create_write_data.py +++ b/examples/dataframe/create_write_data.py @@ -66,10 +66,10 @@ ] client.append_table_data(table_id, batch_list) - # Flush only (no additional rows, mark end_of_data) + # Mark end_of_data for the table + # Supply `None` and `end_of_data=True` client.append_table_data(table_id, None, end_of_data=True) - - # Empty iterable (must supply end_of_data) + # OR supply an empty Iterable and `end_of_data=True` client.append_table_data(table_id, [], end_of_data=True) else: # If pyarrow not installed, flush via JSON path diff --git a/nisystemlink/clients/dataframe/_data_frame_client.py b/nisystemlink/clients/dataframe/_data_frame_client.py index 081cec8a..5c6ac261 100644 --- a/nisystemlink/clients/dataframe/_data_frame_client.py +++ b/nisystemlink/clients/dataframe/_data_frame_client.py @@ -359,11 +359,7 @@ def append_table_data( ) def _generate_body() -> Iterable[memoryview]: - data_iter = iter(data) - try: - batch = next(data_iter) - except StopIteration: - return + batch = first_batch with BytesIO() as buf: options = pa.ipc.IpcWriteOptions(compression="zstd") writer = pa.ipc.new_stream(buf, batch.schema, options=options) @@ -374,14 +370,13 @@ def _generate_body() -> Iterable[memoryview]: yield slice buf.seek(0) try: - batch = next(data_iter) + batch = next(iterator) except StopIteration: break writer.close() - with buf.getbuffer() as view: - with view[0 : buf.tell()] as slice: - yield slice + with buf.getbuffer() as view, view[0 : buf.tell()] as slice: + yield slice try: self._append_table_data_arrow( diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index f3614c45..44770858 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -684,9 +684,10 @@ def test__append_table_data__flush_only_with_none( self, client: DataFrameClient, create_table ): table_id = self._new_single_int_table(create_table) - frame = DataFrame(columns=["a"], data=[["1"]]) - client.append_table_data(table_id, frame) client.append_table_data(table_id, None, end_of_data=True) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 0 + assert metadata.supports_append is False def test__append_table_data__arrow_ingestion_success( self, client: DataFrameClient, create_table @@ -823,16 +824,9 @@ def test__append_table_data__arrow_ingestion_non_400_passthrough( self, client: DataFrameClient ): pa = pytest.importorskip("pyarrow") - table_id = "mock_table_id" batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - f"{client.session.base_url}tables/{table_id}/data", - status=409, - ) - with pytest.raises(ApiException) as excinfo: - client.append_table_data(table_id, [batch], end_of_data=True) + with pytest.raises(ApiException) as excinfo: + client.append_table_data("111111111111111111111111", [batch], end_of_data=True) assert "Arrow ingestion request was rejected" not in str(excinfo.value) def test__append_table_data__unsupported_type_raises( @@ -840,5 +834,4 @@ def 
test__append_table_data__unsupported_type_raises( ): table_id = create_table(basic_table_model) with pytest.raises(ValueError, match="Unsupported type"): - # cast to Any to satisfy type checker while still exercising runtime path - client.append_table_data(table_id, cast(Any, 123)) + client.append_table_data(table_id, 123) # type: ignore[arg-type] From e4d77d96bda85bf33036ba7d2e2a5701698ff8fc Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Mon, 22 Sep 2025 14:14:39 -0500 Subject: [PATCH 18/28] format/lint --- tests/integration/dataframe/test_dataframe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index 44770858..0675152a 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from datetime import datetime, timezone -from typing import Any, cast, List, Optional +from typing import List, Optional import pytest # type: ignore import responses @@ -826,7 +826,9 @@ def test__append_table_data__arrow_ingestion_non_400_passthrough( pa = pytest.importorskip("pyarrow") batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) with pytest.raises(ApiException) as excinfo: - client.append_table_data("111111111111111111111111", [batch], end_of_data=True) + client.append_table_data( + "111111111111111111111111", [batch], end_of_data=True + ) assert "Arrow ingestion request was rejected" not in str(excinfo.value) def test__append_table_data__unsupported_type_raises( From 368ccf0dfdd7a08a449b520e9034eda3bb608a6a Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Tue, 23 Sep 2025 11:44:30 -0500 Subject: [PATCH 19/28] Attempt to fix 3.9 tag test issue --- .../core/_internal/_timestamp_utilities.py | 8 ++++- tests/tag/test_tagmanager.py | 29 +++++++------------ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/nisystemlink/clients/core/_internal/_timestamp_utilities.py b/nisystemlink/clients/core/_internal/_timestamp_utilities.py index a5cd9ec1..8eb84af9 100644 --- a/nisystemlink/clients/core/_internal/_timestamp_utilities.py +++ b/nisystemlink/clients/core/_internal/_timestamp_utilities.py @@ -30,7 +30,13 @@ def datetime_to_str(cls, value: datetime.datetime) -> str: Returns: The string representation of the timestamp. """ - return datetime.datetime.utcfromtimestamp(value.timestamp()).isoformat() + "Z" + # Use timezone-aware conversion to avoid deprecated utcfromtimestamp usage and + # preserve exact UTC semantics (value assumed either naive UTC or aware). 
+ if value.tzinfo is None: + value = value.replace(tzinfo=datetime.timezone.utc) + else: + value = value.astimezone(datetime.timezone.utc) + return value.isoformat().replace("+00:00", "Z") @classmethod def str_to_datetime(cls, timestamp: str) -> datetime.datetime: diff --git a/tests/tag/test_tagmanager.py b/tests/tag/test_tagmanager.py index 19632a08..597fa8d2 100644 --- a/tests/tag/test_tagmanager.py +++ b/tests/tag/test_tagmanager.py @@ -1,5 +1,4 @@ import asyncio -import time import uuid from collections import OrderedDict from datetime import datetime, timedelta, timezone @@ -2350,7 +2349,7 @@ def test__create_writer_with_buffer_size__sends_when_buffer_full(self): writer.write(path, tbase.DataType.INT32, value1, timestamp=timestamp) writer.write(path, tbase.DataType.INT32, value2, timestamp=timestamp) - utctime = datetime.utcfromtimestamp(timestamp.timestamp()).isoformat() + "Z" + utctime = datetime.fromtimestamp(timestamp.timestamp(), timezone.utc).isoformat().replace("+00:00", "Z") self._client.all_requests.assert_called_once_with( "POST", "/nitag/v2/update-current-values", @@ -2383,12 +2382,9 @@ def test__create_writer_with_buffer_time__sends_when_timer_elapsed(self): writer.write(path, tbase.DataType.INT32, value, timestamp=timestamp) self._client.all_requests.assert_not_called() - for i in range(100): - if self._client.all_requests.call_count > 0: - break - time.sleep(0.01) - - utctime = datetime.utcfromtimestamp(timestamp.timestamp()).isoformat() + "Z" + # Deterministic flush instead of waiting for timer. + writer.send_buffered_writes() + utctime = datetime.fromtimestamp(timestamp.timestamp(), timezone.utc).isoformat().replace("+00:00", "Z") self._client.all_requests.assert_called_once_with( "POST", "/nitag/v2/update-current-values", @@ -2424,8 +2420,7 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): writer1.write(path, tbase.DataType.INT32, value1, timestamp=timestamp) writer1.write(path, tbase.DataType.INT32, value2, timestamp=timestamp) - - utctime = datetime.utcfromtimestamp(timestamp.timestamp()).isoformat() + "Z" + utctime = datetime.fromtimestamp(timestamp.timestamp(), timezone.utc).isoformat().replace("+00:00", "Z") self._client.all_requests.assert_called_once_with( "POST", "/nitag/v2/update-current-values", @@ -2449,10 +2444,8 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): writer2.write(path, tbase.DataType.INT32, value3, timestamp=timestamp) assert 1 == self._client.all_requests.call_count # same as before - for i in range(100): - if self._client.all_requests.call_count > 1: - break - time.sleep(0.01) + # Deterministic flush instead of waiting for timer. 
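Outside the tests, the same explicit-flush pattern applies to application code: buffered writes accumulate until ``buffer_size`` or ``max_buffer_time`` is reached, and ``send_buffered_writes`` pushes them immediately. A rough sketch, assuming the usual ``nisystemlink.clients.tag`` imports, a configured SystemLink connection, and an existing tag at the placeholder path ``example.tag``::

    from datetime import datetime, timedelta, timezone
    from nisystemlink.clients.tag import DataType, TagManager

    manager = TagManager()  # uses the configured SystemLink connection
    writer = manager.create_writer(
        buffer_size=100, max_buffer_time=timedelta(seconds=5)
    )
    writer.write(
        "example.tag", DataType.INT32, 42, timestamp=datetime.now(timezone.utc)
    )
    writer.send_buffered_writes()  # flush now instead of waiting for the buffer or timer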
+ writer2.send_buffered_writes() assert 2 == self._client.all_requests.call_count assert self._client.all_requests.call_args_list[1] == mock.call( @@ -2486,7 +2479,7 @@ def test__read_with_timestamp_and_aggregates__retrieves_all_data_from_server(sel path = "test" value = "success" now = datetime.now(timezone.utc) - utctime = datetime.utcfromtimestamp(now.timestamp()).isoformat() + "Z" + utctime = now.isoformat().replace("+00:00", "Z") self._client.all_requests.configure_mock( side_effect=self._get_mock_request( [ @@ -2554,7 +2547,7 @@ def test__read_with_timestamp__does_not_query_aggregates(self): path = "test" value = "success" now = datetime.now(timezone.utc) - utctime = datetime.utcfromtimestamp(now.timestamp()).isoformat() + "Z" + utctime = now.isoformat().replace("+00:00", "Z") self._client.all_requests.configure_mock( side_effect=self._get_mock_request( [ @@ -2669,7 +2662,7 @@ async def test__read_async_with_timestamp_and_aggregates__retrieves_all_data_fro path = "test" value = "success" now = datetime.now(timezone.utc) - utctime = datetime.utcfromtimestamp(now.timestamp()).isoformat() + "Z" + utctime = now.isoformat().replace("+00:00", "Z") self._client.all_requests.configure_mock( side_effect=self._get_mock_request( [ @@ -2743,7 +2736,7 @@ async def test__read_async_with_timestamp__does_not_query_aggregates(self): path = "test" value = "success" now = datetime.now(timezone.utc) - utctime = datetime.utcfromtimestamp(now.timestamp()).isoformat() + "Z" + utctime = now.isoformat().replace("+00:00", "Z") self._client.all_requests.configure_mock( side_effect=self._get_mock_request( [ From bcab56d03c5845928026f6d234e2bc8fa0326118 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Tue, 23 Sep 2025 11:46:39 -0500 Subject: [PATCH 20/28] format --- tests/tag/test_tagmanager.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/tag/test_tagmanager.py b/tests/tag/test_tagmanager.py index 597fa8d2..6f9ebbbd 100644 --- a/tests/tag/test_tagmanager.py +++ b/tests/tag/test_tagmanager.py @@ -2349,7 +2349,11 @@ def test__create_writer_with_buffer_size__sends_when_buffer_full(self): writer.write(path, tbase.DataType.INT32, value1, timestamp=timestamp) writer.write(path, tbase.DataType.INT32, value2, timestamp=timestamp) - utctime = datetime.fromtimestamp(timestamp.timestamp(), timezone.utc).isoformat().replace("+00:00", "Z") + utctime = ( + datetime.fromtimestamp(timestamp.timestamp(), timezone.utc) + .isoformat() + .replace("+00:00", "Z") + ) self._client.all_requests.assert_called_once_with( "POST", "/nitag/v2/update-current-values", @@ -2384,7 +2388,11 @@ def test__create_writer_with_buffer_time__sends_when_timer_elapsed(self): self._client.all_requests.assert_not_called() # Deterministic flush instead of waiting for timer. 
writer.send_buffered_writes() - utctime = datetime.fromtimestamp(timestamp.timestamp(), timezone.utc).isoformat().replace("+00:00", "Z") + utctime = ( + datetime.fromtimestamp(timestamp.timestamp(), timezone.utc) + .isoformat() + .replace("+00:00", "Z") + ) self._client.all_requests.assert_called_once_with( "POST", "/nitag/v2/update-current-values", @@ -2420,7 +2428,11 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): writer1.write(path, tbase.DataType.INT32, value1, timestamp=timestamp) writer1.write(path, tbase.DataType.INT32, value2, timestamp=timestamp) - utctime = datetime.fromtimestamp(timestamp.timestamp(), timezone.utc).isoformat().replace("+00:00", "Z") + utctime = ( + datetime.fromtimestamp(timestamp.timestamp(), timezone.utc) + .isoformat() + .replace("+00:00", "Z") + ) self._client.all_requests.assert_called_once_with( "POST", "/nitag/v2/update-current-values", From feb3d43bd64d5ff1c4128ce9a76b14badb8b8f0e Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 24 Sep 2025 08:29:07 -0500 Subject: [PATCH 21/28] Revert to using timer --- tests/tag/test_tagmanager.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/tag/test_tagmanager.py b/tests/tag/test_tagmanager.py index 6f9ebbbd..2e548ea4 100644 --- a/tests/tag/test_tagmanager.py +++ b/tests/tag/test_tagmanager.py @@ -1,4 +1,5 @@ import asyncio +import time import uuid from collections import OrderedDict from datetime import datetime, timedelta, timezone @@ -2386,8 +2387,11 @@ def test__create_writer_with_buffer_time__sends_when_timer_elapsed(self): writer.write(path, tbase.DataType.INT32, value, timestamp=timestamp) self._client.all_requests.assert_not_called() - # Deterministic flush instead of waiting for timer. - writer.send_buffered_writes() + for i in range(100): + if self._client.all_requests.call_count > 0: + break + time.sleep(0.01) + utctime = ( datetime.fromtimestamp(timestamp.timestamp(), timezone.utc) .isoformat() @@ -2456,8 +2460,10 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): writer2.write(path, tbase.DataType.INT32, value3, timestamp=timestamp) assert 1 == self._client.all_requests.call_count # same as before - # Deterministic flush instead of waiting for timer. 
- writer2.send_buffered_writes() + for i in range(100): + if self._client.all_requests.call_count > 0: + break + time.sleep(0.01) assert 2 == self._client.all_requests.call_count assert self._client.all_requests.call_args_list[1] == mock.call( From 435a192dc8f167b6b03efef908c1832ed1952cb1 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 24 Sep 2025 08:41:15 -0500 Subject: [PATCH 22/28] 0 -> 1 --- tests/tag/test_tagmanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tag/test_tagmanager.py b/tests/tag/test_tagmanager.py index 2e548ea4..775088c7 100644 --- a/tests/tag/test_tagmanager.py +++ b/tests/tag/test_tagmanager.py @@ -2461,7 +2461,7 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): writer2.write(path, tbase.DataType.INT32, value3, timestamp=timestamp) assert 1 == self._client.all_requests.call_count # same as before for i in range(100): - if self._client.all_requests.call_count > 0: + if self._client.all_requests.call_count > 1: break time.sleep(0.01) From e09a04eaca75cad0982e1ec9979a4ed5dab1815b Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 24 Sep 2025 08:54:13 -0500 Subject: [PATCH 23/28] Increase timeout --- tests/tag/test_tagmanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tag/test_tagmanager.py b/tests/tag/test_tagmanager.py index 775088c7..0e384cdc 100644 --- a/tests/tag/test_tagmanager.py +++ b/tests/tag/test_tagmanager.py @@ -2387,7 +2387,7 @@ def test__create_writer_with_buffer_time__sends_when_timer_elapsed(self): writer.write(path, tbase.DataType.INT32, value, timestamp=timestamp) self._client.all_requests.assert_not_called() - for i in range(100): + for _ in range(500): if self._client.all_requests.call_count > 0: break time.sleep(0.01) From 4d8f056bef8b3928f4d3c3a356db5a72e9ac98e5 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 24 Sep 2025 09:36:30 -0500 Subject: [PATCH 24/28] test improvements --- tests/tag/test_tagmanager.py | 65 +++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/tests/tag/test_tagmanager.py b/tests/tag/test_tagmanager.py index 0e384cdc..521014bb 100644 --- a/tests/tag/test_tagmanager.py +++ b/tests/tag/test_tagmanager.py @@ -1,6 +1,7 @@ import asyncio import time import uuid +from typing import Optional from collections import OrderedDict from datetime import datetime, timedelta, timezone from unittest import mock @@ -12,6 +13,38 @@ from ..anyorderlist import AnyOrderList +def _wait_for_call_count(mock_obj, expected: int, *, timeout: float = 3.0, min_elapsed: Optional[float] = None): + """Wait until mock_obj.call_count >= expected or timeout. + + Args: + mock_obj: Mock with call_count attribute. + expected: Target call count (>=). + timeout: Max seconds to wait. + min_elapsed: If provided, fail if the condition is met before this many seconds (guards against premature flushes). + + Returns: + Elapsed seconds when condition satisfied. + + Raises: + AssertionError on timeout or premature satisfaction. + """ + start = time.monotonic() + while True: + current = mock_obj.call_count + if current >= expected: + elapsed = time.monotonic() - start + if min_elapsed is not None and elapsed < min_elapsed: + raise AssertionError( + f"Condition reached too early: call_count={current} elapsed={elapsed:.4f}s < min_elapsed={min_elapsed:.4f}s" + ) + return elapsed + if time.monotonic() - start >= timeout: + raise AssertionError( + f"Timed out waiting for call_count >= {expected}. 
Final={current} timeout={timeout}s" + ) + time.sleep(0.01) + + class TestTagManager(HttpClientTestBase): def setup_method(self, method): super().setup_method(method) @@ -2379,18 +2412,20 @@ def test__create_writer_with_buffer_size__sends_when_buffer_full(self): def test__create_writer_with_buffer_time__sends_when_timer_elapsed(self): path = "tag" value = 1 - writer = self._uut.create_writer(max_buffer_time=timedelta(milliseconds=50)) + buffer_ms = 50 + writer = self._uut.create_writer(max_buffer_time=timedelta(milliseconds=buffer_ms)) timestamp = datetime.now() - self._client.all_requests.configure_mock( - side_effect=self._get_mock_request([None]) - ) + self._client.all_requests.configure_mock(side_effect=self._get_mock_request([None])) writer.write(path, tbase.DataType.INT32, value, timestamp=timestamp) self._client.all_requests.assert_not_called() - for _ in range(500): - if self._client.all_requests.call_count > 0: - break - time.sleep(0.01) + + # Warm wait: allow timer thread to start; > configured buffer time but small overall. + time.sleep(buffer_ms / 1000 * 1.5) + # Should still not have flushed yet (some jitter allowed). If it has, test still passes but we note it. + if self._client.all_requests.call_count == 0: + # Now wait until call observed or timeout, enforcing not too early flush (<20ms) to catch regressions. + _wait_for_call_count(self._client.all_requests, 1, timeout=3.0, min_elapsed=0.02) utctime = ( datetime.fromtimestamp(timestamp.timestamp(), timezone.utc) @@ -2422,8 +2457,9 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): writer1 = self._uut.create_writer( buffer_size=2, max_buffer_time=timedelta(minutes=1) ) + buffer_ms = 50 writer2 = self._uut.create_writer( - buffer_size=2, max_buffer_time=timedelta(milliseconds=50) + buffer_size=2, max_buffer_time=timedelta(milliseconds=buffer_ms) ) timestamp = datetime.now() self._client.all_requests.configure_mock( @@ -2459,12 +2495,11 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): ) writer2.write(path, tbase.DataType.INT32, value3, timestamp=timestamp) - assert 1 == self._client.all_requests.call_count # same as before - for i in range(100): - if self._client.all_requests.call_count > 1: - break - time.sleep(0.01) - + assert 1 == self._client.all_requests.call_count # still only size-based flush so far + # Warm wait beyond buffer time; only after this should timer flush occur. + time.sleep(buffer_ms / 1000 * 1.5) + if self._client.all_requests.call_count == 1: + _wait_for_call_count(self._client.all_requests, 2, timeout=3.0, min_elapsed=0.02) assert 2 == self._client.all_requests.call_count assert self._client.all_requests.call_args_list[1] == mock.call( "POST", From c3230f3958a5e1710f48f1e7fbc73d9c73db8566 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 24 Sep 2025 09:48:30 -0500 Subject: [PATCH 25/28] iterating --- tests/tag/test_tagmanager.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/tests/tag/test_tagmanager.py b/tests/tag/test_tagmanager.py index 521014bb..5bfa381f 100644 --- a/tests/tag/test_tagmanager.py +++ b/tests/tag/test_tagmanager.py @@ -12,8 +12,8 @@ from .http.httpclienttestbase import HttpClientTestBase, MockResponse from ..anyorderlist import AnyOrderList - def _wait_for_call_count(mock_obj, expected: int, *, timeout: float = 3.0, min_elapsed: Optional[float] = None): + """Wait until mock_obj.call_count >= expected or timeout. 
Args: @@ -2420,12 +2420,15 @@ def test__create_writer_with_buffer_time__sends_when_timer_elapsed(self): writer.write(path, tbase.DataType.INT32, value, timestamp=timestamp) self._client.all_requests.assert_not_called() - # Warm wait: allow timer thread to start; > configured buffer time but small overall. - time.sleep(buffer_ms / 1000 * 1.5) - # Should still not have flushed yet (some jitter allowed). If it has, test still passes but we note it. + warm = buffer_ms / 1000 * 2.5 # generous warm period for slow CI start + time.sleep(warm) if self._client.all_requests.call_count == 0: - # Now wait until call observed or timeout, enforcing not too early flush (<20ms) to catch regressions. - _wait_for_call_count(self._client.all_requests, 1, timeout=3.0, min_elapsed=0.02) + _wait_for_call_count( + self._client.all_requests, + 1, + timeout=5.0, + min_elapsed=buffer_ms / 1000 * 0.4, # at least 40% of interval to avoid racey early flush + ) utctime = ( datetime.fromtimestamp(timestamp.timestamp(), timezone.utc) @@ -2495,11 +2498,16 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): ) writer2.write(path, tbase.DataType.INT32, value3, timestamp=timestamp) - assert 1 == self._client.all_requests.call_count # still only size-based flush so far - # Warm wait beyond buffer time; only after this should timer flush occur. - time.sleep(buffer_ms / 1000 * 1.5) + assert 1 == self._client.all_requests.call_count + # Use longer warm period in CI to allow thread scheduling + time.sleep(buffer_ms / 1000 * 2.5) if self._client.all_requests.call_count == 1: - _wait_for_call_count(self._client.all_requests, 2, timeout=3.0, min_elapsed=0.02) + _wait_for_call_count( + self._client.all_requests, + 2, + timeout=5.0, + min_elapsed=buffer_ms / 1000 * 0.4, + ) assert 2 == self._client.all_requests.call_count assert self._client.all_requests.call_args_list[1] == mock.call( "POST", From 99f8af665e0a7f0e9f6c5d94836aebe66554cab7 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 24 Sep 2025 10:10:41 -0500 Subject: [PATCH 26/28] cleanup --- tests/tag/test_tagmanager.py | 46 +++++++++++++----------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/tests/tag/test_tagmanager.py b/tests/tag/test_tagmanager.py index 5bfa381f..56bb55be 100644 --- a/tests/tag/test_tagmanager.py +++ b/tests/tag/test_tagmanager.py @@ -1,7 +1,6 @@ import asyncio import time import uuid -from typing import Optional from collections import OrderedDict from datetime import datetime, timedelta, timezone from unittest import mock @@ -12,15 +11,14 @@ from .http.httpclienttestbase import HttpClientTestBase, MockResponse from ..anyorderlist import AnyOrderList -def _wait_for_call_count(mock_obj, expected: int, *, timeout: float = 3.0, min_elapsed: Optional[float] = None): +def _wait_for_call_count(mock_obj, expected: int, *, timeout: float = 3.0): """Wait until mock_obj.call_count >= expected or timeout. Args: mock_obj: Mock with call_count attribute. expected: Target call count (>=). timeout: Max seconds to wait. - min_elapsed: If provided, fail if the condition is met before this many seconds (guards against premature flushes). Returns: Elapsed seconds when condition satisfied. 
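The helper is also easy to exercise on its own when reviewing this change. A small sketch, run with ``_wait_for_call_count`` in scope, that uses a ``threading.Timer`` to stand in for the writer's flush thread (the 50 ms delay is arbitrary)::

    import threading
    from unittest import mock

    requests_mock = mock.Mock()
    threading.Timer(0.05, requests_mock).start()  # fires once after ~50 ms
    elapsed = _wait_for_call_count(requests_mock, 1, timeout=3.0)
    assert requests_mock.call_count == 1
    print(f"flush observed after {elapsed:.3f} s")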
@@ -30,17 +28,11 @@ def _wait_for_call_count(mock_obj, expected: int, *, timeout: float = 3.0, min_e """ start = time.monotonic() while True: - current = mock_obj.call_count - if current >= expected: - elapsed = time.monotonic() - start - if min_elapsed is not None and elapsed < min_elapsed: - raise AssertionError( - f"Condition reached too early: call_count={current} elapsed={elapsed:.4f}s < min_elapsed={min_elapsed:.4f}s" - ) - return elapsed + if mock_obj.call_count >= expected: + return time.monotonic() - start if time.monotonic() - start >= timeout: raise AssertionError( - f"Timed out waiting for call_count >= {expected}. Final={current} timeout={timeout}s" + f"Timed out waiting for call_count >= {expected}. Final={mock_obj.call_count} timeout={timeout}s" ) time.sleep(0.01) @@ -2413,22 +2405,20 @@ def test__create_writer_with_buffer_time__sends_when_timer_elapsed(self): path = "tag" value = 1 buffer_ms = 50 - writer = self._uut.create_writer(max_buffer_time=timedelta(milliseconds=buffer_ms)) + writer = self._uut.create_writer( + max_buffer_time=timedelta(milliseconds=buffer_ms) + ) timestamp = datetime.now() - self._client.all_requests.configure_mock(side_effect=self._get_mock_request([None])) + self._client.all_requests.configure_mock( + side_effect=self._get_mock_request([None]) + ) writer.write(path, tbase.DataType.INT32, value, timestamp=timestamp) self._client.all_requests.assert_not_called() - warm = buffer_ms / 1000 * 2.5 # generous warm period for slow CI start - time.sleep(warm) + time.sleep(buffer_ms / 1000 * 2.5) if self._client.all_requests.call_count == 0: - _wait_for_call_count( - self._client.all_requests, - 1, - timeout=5.0, - min_elapsed=buffer_ms / 1000 * 0.4, # at least 40% of interval to avoid racey early flush - ) + _wait_for_call_count(self._client.all_requests, 1, timeout=5.0) utctime = ( datetime.fromtimestamp(timestamp.timestamp(), timezone.utc) @@ -2498,16 +2488,12 @@ def test__create_writer_with_buffer_size_and_timer__obeys_both_settings(self): ) writer2.write(path, tbase.DataType.INT32, value3, timestamp=timestamp) - assert 1 == self._client.all_requests.call_count - # Use longer warm period in CI to allow thread scheduling + assert 1 == self._client.all_requests.call_count # same as before + time.sleep(buffer_ms / 1000 * 2.5) if self._client.all_requests.call_count == 1: - _wait_for_call_count( - self._client.all_requests, - 2, - timeout=5.0, - min_elapsed=buffer_ms / 1000 * 0.4, - ) + _wait_for_call_count(self._client.all_requests, 2, timeout=5.0) + assert 2 == self._client.all_requests.call_count assert self._client.all_requests.call_args_list[1] == mock.call( "POST", From 6067577a07167892438cfb7a9ca17ccc5ac73685 Mon Sep 17 00:00:00 2001 From: Cameron Waterman Date: Wed, 24 Sep 2025 14:28:14 -0500 Subject: [PATCH 27/28] Cleanup example --- examples/dataframe/create_write_data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/dataframe/create_write_data.py b/examples/dataframe/create_write_data.py index c842dd79..03f52029 100644 --- a/examples/dataframe/create_write_data.py +++ b/examples/dataframe/create_write_data.py @@ -69,8 +69,6 @@ # Mark end_of_data for the table # Supply `None` and `end_of_data=True` client.append_table_data(table_id, None, end_of_data=True) - # OR supply an empty Iterable and `end_of_data=True` - client.append_table_data(table_id, [], end_of_data=True) else: # If pyarrow not installed, flush via JSON path client.append_table_data(table_id, None, end_of_data=True) From 9782bdee1858063987a595423805c745fb6330ad Mon Sep 
17 00:00:00 2001 From: Cameron Waterman Date: Wed, 24 Sep 2025 15:05:24 -0500 Subject: [PATCH 28/28] Add test assertions for row_count and supports_append --- tests/integration/dataframe/test_dataframe.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/integration/dataframe/test_dataframe.py b/tests/integration/dataframe/test_dataframe.py index 0675152a..4596297c 100644 --- a/tests/integration/dataframe/test_dataframe.py +++ b/tests/integration/dataframe/test_dataframe.py @@ -637,6 +637,9 @@ def test__append_table_data__append_request_success( client.append_table_data( table_id, AppendTableDataRequest(frame=frame, end_of_data=True) ) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 2 + assert metadata.supports_append is False def test__append_table_data__append_request_with_end_of_data_argument_disallowed( self, client: DataFrameClient, create_table @@ -656,6 +659,9 @@ def test__append_table_data__append_request_without_end_of_data_success( table_id = self._new_single_int_table(create_table) frame = DataFrame(columns=["a"], data=[["7"], ["8"]]) client.append_table_data(table_id, AppendTableDataRequest(frame=frame)) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 2 + assert metadata.supports_append is True def test__append_table_data__accepts_dataframe_model( self, client: DataFrameClient, create_table @@ -663,6 +669,9 @@ def test__append_table_data__accepts_dataframe_model( table_id = self._new_single_int_table(create_table) frame = DataFrame(columns=["a"], data=[["1"], ["2"]]) client.append_table_data(table_id, frame, end_of_data=True) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 2 + assert metadata.supports_append is False def test__append_table_data__dataframe_without_end_of_data_success( self, client: DataFrameClient, create_table @@ -670,6 +679,9 @@ def test__append_table_data__dataframe_without_end_of_data_success( table_id = self._new_single_int_table(create_table) frame = DataFrame(columns=["a"], data=[["10"], ["11"]]) client.append_table_data(table_id, frame) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 2 + assert metadata.supports_append is True def test__append_table_data__none_without_end_of_data_raises( self, client: DataFrameClient, create_table @@ -696,6 +708,9 @@ def test__append_table_data__arrow_ingestion_success( table_id = self._new_single_int_table(create_table) batch = pa.record_batch([pa.array([10, 11, 12])], names=["a"]) client.append_table_data(table_id, [batch], end_of_data=True) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 3 + assert metadata.supports_append is False with pytest.raises(ApiException): client.append_table_data(table_id, None, end_of_data=True) @@ -706,6 +721,9 @@ def test__append_table_data__single_recordbatch_success( table_id = self._new_single_int_table(create_table) batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"]) client.append_table_data(table_id, batch, end_of_data=True) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 3 + assert metadata.supports_append is False with pytest.raises(ApiException): client.append_table_data(table_id, None, end_of_data=True) @@ -716,8 +734,14 @@ def test__append_table_data__arrow_ingestion_with_end_of_data_query_param_false( table_id = self._new_single_int_table(create_table) batch1 = pa.record_batch([pa.array([4, 5, 6])], names=["a"]) client.append_table_data(table_id, 
[batch1], end_of_data=False) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 3 + assert metadata.supports_append is True batch2 = pa.record_batch([pa.array([7, 8])], names=["a"]) client.append_table_data(table_id, [batch2], end_of_data=True) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 5 + assert metadata.supports_append is False def test__append_table_data__empty_iterator_requires_end_of_data( self, client: DataFrameClient, create_table @@ -729,6 +753,9 @@ def test__append_table_data__empty_iterator_requires_end_of_data( ): client.append_table_data(table_id, []) client.append_table_data(table_id, [], end_of_data=True) + metadata = client.get_table_metadata(table_id) + assert metadata.row_count == 0 + assert metadata.supports_append is False def test__append_table_data__arrow_iterable_with_non_recordbatch_elements_raises( self, client: DataFrameClient, create_table