From 13144c3a0b58577f43b814c6f96fb5b04b3c9803 Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Tue, 3 Feb 2026 15:05:42 +0100
Subject: [PATCH 1/2] improve handling of categoricals for feature_key in points

---
 src/spatialdata/models/models.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py
index e834ad78..d9fd0a1c 100644
--- a/src/spatialdata/models/models.py
+++ b/src/spatialdata/models/models.py
@@ -819,7 +819,7 @@ def _(
             # TODO: dask does not allow for setting divisions directly anymore. We have to decide on forcing the user.
             if feature_key is not None:
                 feature_categ = dd.from_pandas(
-                    data[feature_key].astype(str).astype("category"),
+                    data[feature_key],
                     sort=sort,
                     **kwargs,
                 )
@@ -827,11 +827,21 @@ def _(
         elif isinstance(data, dd.DataFrame):
             table = data[[coordinates[ax] for ax in axes]]
             table.columns = axes
-            if feature_key is not None:
-                if data[feature_key].dtype.name == "category":
-                    table[feature_key] = data[feature_key]
-                else:
-                    table[feature_key] = data[feature_key].astype(str).astype("category")
+
+            if feature_key is not None:
+                if data[feature_key].dtype.name == "category":
+                    table[feature_key] = data[feature_key]
+                else:
+                    logger.warning(
+                        f"The `feature_key` column {feature_key} is not categorical, converting it now. "
+                        "Please convert the column to categorical before calling `PointsModel.parse()` to "
+                        "avoid significant performance implications due to the need for dask to compute "
+                        "the categories. If you did not use `PointsModel.parse()` explicitly in your code "
+                        "(e.g. this message is coming from a reader in `spatialdata_io`), please report "
+                        "this finding."
+                    )
+                    table[feature_key] = data[feature_key].astype(str).astype("category")
+
             if instance_key is not None:
                 table[instance_key] = data[instance_key]
             for c in [X, Y, Z]:

From 9b60a239e2184e59d6f8290f22ee11272f09bcb9 Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Tue, 3 Feb 2026 16:36:51 +0100
Subject: [PATCH 2/2] pin distributed; improve warning for categorical points

---
 pyproject.toml                   |  1 +
 src/spatialdata/models/models.py | 38 ++++++++++++++++++--------------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 48c61d29..0e813ead 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "click",
     "dask-image",
     "dask>=2025.2.0,<2026.1.2",
+    "distributed<2026.1.2",
     "datashader",
     "fsspec[s3,http]",
     "geopandas>=0.14",
diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py
index d9fd0a1c..6a126b02 100644
--- a/src/spatialdata/models/models.py
+++ b/src/spatialdata/models/models.py
@@ -242,6 +242,8 @@ def parse(
         else:
             # Chunk single scale images
             if chunks is not None:
+                if isinstance(chunks, tuple):
+                    chunks = {dim: chunks[index] for index, dim in enumerate(data.dims)}
                 data = data.chunk(chunks=chunks)
             cls()._check_chunk_size_not_too_large(data)
         # recompute coordinates for (multiscale) spatial image
@@ -832,14 +834,8 @@ def _(
                 if data[feature_key].dtype.name == "category":
                     table[feature_key] = data[feature_key]
                 else:
-                    logger.warning(
-                        f"The `feature_key` column {feature_key} is not categorical, converting it now. "
-                        "Please convert the column to categorical before calling `PointsModel.parse()` to "
-                        "avoid significant performance implications due to the need for dask to compute "
-                        "the categories. If you did not use `PointsModel.parse()` explicitly in your code "
-                        "(e.g. this message is coming from a reader in `spatialdata_io`), please report "
-                        "this finding."
-                    )
+                    # this will cause the categories to be unknown and trigger the warning (and performance
+                    # slowdown) in _add_metadata_and_validate()
                     table[feature_key] = data[feature_key].astype(str).astype("category")
 
             if instance_key is not None:
                 table[instance_key] = data[instance_key]
             for c in [X, Y, Z]:
@@ -895,15 +891,20 @@ def _add_metadata_and_validate(
             assert instance_key in data.columns
             data.attrs[ATTRS_KEY][cls.INSTANCE_KEY] = instance_key
 
-        for c in data.columns:
-            # Here we are explicitly importing the categories
-            # but it is a convenient way to ensure that the categories are known.
-            # It also just changes the state of the series, so it is not a big deal.
-            if isinstance(data[c].dtype, CategoricalDtype) and not data[c].cat.known:
-                try:
-                    data[c] = data[c].cat.set_categories(data[c].compute().cat.categories)
-                except ValueError:
-                    logger.info(f"Column `{c}` contains unknown categories. Consider casting it.")
+        if (
+            feature_key is not None
+            and isinstance(data[feature_key].dtype, CategoricalDtype)
+            and not data[feature_key].cat.known
+        ):
+            logger.warning(
+                f"The `feature_key` column {feature_key} is categorical with unknown categories. "
+                "Please ensure the categories are known before calling `PointsModel.parse()` to "
+                "avoid significant performance implications due to the need for dask to compute "
+                "the categories. If you did not use `PointsModel.parse()` explicitly in your code "
+                "(e.g. this message is coming from a reader in `spatialdata_io`), please report "
+                "this finding."
+            )
+            data[feature_key] = data[feature_key].cat.set_categories(data[feature_key].compute().cat.categories)
 
         _parse_transformations(data, transformations)
         cls.validate(data)
@@ -1163,6 +1164,9 @@ def parse(
             The parsed data.
         """
         validate_table_attr_keys(adata)
+        # Convert a view to an actual copy to avoid an ImplicitModificationWarning when modifying .uns
+        if adata.is_view:
+            adata = adata.copy()
         # either all live in adata.uns or all be passed in as argument
         n_args = sum([region is not None, region_key is not None, instance_key is not None])
        if n_args == 0:
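
A short usage sketch of the behaviour targeted by these patches: how a caller can make the `feature_key` column categorical with known categories up front, so that neither the conversion in `PointsModel.parse()` nor the eager category computation in `_add_metadata_and_validate()` is triggered. The column names (`x`, `y`, `gene`) and the sample values are illustrative assumptions, not part of the patches; dask's `Series.cat.as_known()` is the standard way to materialize unknown categories.

    import dask.dataframe as dd
    import pandas as pd

    from spatialdata.models import PointsModel

    # toy points table; "gene" stands in for the feature_key column
    df = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0], "gene": ["a", "b"]})

    # pandas input: a plain categorical cast is enough, the categories are known
    df["gene"] = df["gene"].astype("category")
    points = PointsModel.parse(df, coordinates={"x": "x", "y": "y"}, feature_key="gene")

    # dask input: ensure the categories are known, otherwise the warning above is
    # emitted and the categories are computed eagerly during validation
    ddf = dd.from_pandas(df, npartitions=1)
    ddf["gene"] = ddf["gene"].cat.as_known()
    points = PointsModel.parse(ddf, coordinates={"x": "x", "y": "y"}, feature_key="gene")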
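
For the chunking change in the `@@ -242,6 +242,8` hunk, a minimal sketch (with an assumed `(c, y, x)` image and assumed chunk sizes) of what the tuple-to-dict conversion does before `DataArray.chunk()` is called:

    import numpy as np
    import xarray as xr

    data = xr.DataArray(np.zeros((3, 64, 64)), dims=("c", "y", "x"))
    chunks = (1, 32, 32)  # positional chunk sizes, one entry per dimension

    # map positional chunk sizes onto the array's dimension names
    if isinstance(chunks, tuple):
        chunks = {dim: chunks[index] for index, dim in enumerate(data.dims)}
    # chunks is now {"c": 1, "y": 32, "x": 32}
    data = data.chunk(chunks=chunks)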