From 01854779d1490d5c32c5295be1ace4b755072f39 Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Tue, 3 Feb 2026 18:27:30 +0100
Subject: [PATCH 1/3] faster xenium polygon parsing via ragged arrays

---
 src/spatialdata_io/readers/xenium.py | 59 +++++++++++++++++-----------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index e4bd1d3f..91b15a02 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -22,7 +22,7 @@
 from geopandas import GeoDataFrame
 from joblib import Parallel, delayed
 from pyarrow import Table
-from shapely import Polygon
+from shapely import GeometryType, Polygon, from_ragged_array
 from spatialdata import SpatialData
 from spatialdata._core.query.relational_query import get_element_instances
 from spatialdata.models import (
@@ -69,7 +69,6 @@ def xenium(
     morphology_focus: bool = True,
     aligned_images: bool = True,
     cells_table: bool = True,
-    n_jobs: int = 1,
     gex_only: bool = True,
     imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
     image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
@@ -122,8 +121,6 @@
         `False` and use the `xenium_aligned_image` function directly.
     cells_table
         Whether to read the cell annotations in the `AnnData` table.
-    n_jobs
-        Number of jobs to use for parallel processing.
     gex_only
         Whether to load only the "Gene Expression" feature type.
     imread_kwargs
@@ -261,7 +258,6 @@
             path,
             XeniumKeys.NUCLEUS_BOUNDARIES_FILE,
             specs,
-            n_jobs,
             idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
         )

@@ -270,7 +266,6 @@
             path,
             XeniumKeys.CELL_BOUNDARIES_FILE,
             specs,
-            n_jobs,
             idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
         )

@@ -406,7 +401,7 @@ def filter(self, record: logging.LogRecord) -> bool:

 def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
     if isinstance(cell_id_column.iloc[0], bytes):
-        return cell_id_column.apply(lambda x: x.decode("utf-8"))
+        return cell_id_column.str.decode("utf-8")
     return cell_id_column


@@ -414,39 +409,59 @@ def _get_polygons(
     path: Path,
     file: str,
     specs: dict[str, Any],
-    n_jobs: int,
     idx: ArrayLike | None = None,
 ) -> GeoDataFrame:
     # seems to be faster than pd.read_parquet
     df = pq.read_table(path / file).to_pandas()
+    cell_ids = df[XeniumKeys.CELL_ID].to_numpy()
+    x = df[XeniumKeys.BOUNDARIES_VERTEX_X].to_numpy()
+    y = df[XeniumKeys.BOUNDARIES_VERTEX_Y].to_numpy()
+    coords = np.column_stack([x, y])
+
+    change_mask = np.concatenate([[True], cell_ids[1:] != cell_ids[:-1]])
+    group_starts = np.where(change_mask)[0]
+    group_ends = np.concatenate([group_starts[1:], [len(cell_ids)]])
+
+    # sanity check
+    n_unique_ids = len(df[XeniumKeys.CELL_ID].drop_duplicates())
+    if len(group_starts) != n_unique_ids:
+        raise ValueError(
+            f"In {file}, rows belonging to the same polygon must be contiguous. "
+            f"Expected {n_unique_ids} group starts, but found {len(group_starts)}. "
+            f"This indicates non-consecutive polygon rows."
+            )

-    group_by = df.groupby(XeniumKeys.CELL_ID)
-    index = pd.Series(group_by.indices.keys())
-    # convert the index to str since we will compare it with an AnnData object, where the index is a str
-    index.index = index.index.astype(str)
-    index = _decode_cell_id_column(index)
+    unique_ids = cell_ids[group_starts]

-    out = Parallel(n_jobs=n_jobs)(
-        delayed(Polygon)(i.to_numpy())
-        for _, i in group_by[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]]
-    )
-    geo_df = GeoDataFrame({"geometry": out})
+    # offsets for the ragged array:
+    # offsets[0] (ring_offsets): start/end indices of each ring's vertex run in `coords`
+    # offsets[1] (geom_offsets): start/end indices of each polygon's rings
+    ring_offsets = np.concatenate([[0], group_ends])  # vertex positions
+    geom_offsets = np.arange(len(group_starts) + 1)  # [0, 1, 2, ..., n_polygons]
+
+    geoms = from_ragged_array(GeometryType.POLYGON, coords, offsets=(ring_offsets, geom_offsets))
+
+    index = _decode_cell_id_column(pd.Series(unique_ids))
+    geo_df = GeoDataFrame({"geometry": geoms}, index=index.values)
+
+    import time
+
+    start = time.time()
     version = _parse_version_of_xenium_analyzer(specs)
     if version is not None and version < packaging.version.parse("2.0.0"):
         assert idx is not None
         assert len(idx) == len(geo_df)
-        assert np.unique(geo_df.index).size == len(geo_df)
         assert index.equals(idx)
-        geo_df.index = idx
     else:
-        geo_df.index = index
-        if not np.unique(geo_df.index).size == len(geo_df):
+        if np.unique(geo_df.index).size != len(geo_df):
             warnings.warn(
                 "Found non-unique polygon indices, this will be addressed in a future version of the reader. For the "
                 "time being please consider merging polygons with non-unique indices into single multi-polygons.",
                 UserWarning,
                 stacklevel=2,
             )
+    print(f"sanity check idx vs index: {time.time() - start}")
+
     scale = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
     return ShapesModel.parse(geo_df, transformations={"global": scale})

From 7b73d334c876ea8cf17852ca4d3afa3637702752 Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Tue, 3 Feb 2026 18:28:22 +0100
Subject: [PATCH 2/3] remove print

---
 src/spatialdata_io/readers/xenium.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index 91b15a02..3d3224c6 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -444,9 +444,6 @@ def _get_polygons(
     index = _decode_cell_id_column(pd.Series(unique_ids))
     geo_df = GeoDataFrame({"geometry": geoms}, index=index.values)

-    import time
-
-    start = time.time()
     version = _parse_version_of_xenium_analyzer(specs)
     if version is not None and version < packaging.version.parse("2.0.0"):
         assert idx is not None
@@ -460,7 +457,6 @@ def _get_polygons(
             UserWarning,
             stacklevel=2,
         )
-    print(f"sanity check idx vs index: {time.time() - start}")

     scale = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
     return ShapesModel.parse(geo_df, transformations={"global": scale})

From dc8e297e45cf039a3dd4e04ca8f1d3afb582d724 Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Wed, 4 Feb 2026 11:33:58 +0100
Subject: [PATCH 3/3] remove n_jobs from xenium cli

---
 src/spatialdata_io/__main__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/spatialdata_io/__main__.py b/src/spatialdata_io/__main__.py
index 40c45518..35fd24de 100644
--- a/src/spatialdata_io/__main__.py
+++ b/src/spatialdata_io/__main__.py
@@ -480,7 +480,6 @@ def visium_hd_wrapper(
default=True, help="Whether to read cells annotations in the AnnData table. [default: True]", ) -@click.option("--n-jobs", type=int, default=1, help="Number of jobs. [default: 1]") def xenium_wrapper( input: str, output: str, @@ -495,7 +494,6 @@ def xenium_wrapper( morphology_focus: bool = True, aligned_images: bool = True, cells_table: bool = True, - n_jobs: int = 1, ) -> None: """Xenium conversion to SpatialData.""" sdata = xenium( # type: ignore[name-defined] # noqa: F821 @@ -510,7 +508,6 @@ def xenium_wrapper( morphology_focus=morphology_focus, aligned_images=aligned_images, cells_table=cells_table, - n_jobs=n_jobs, ) sdata.write(output)
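
Note for reviewers: the core of PATCH 1/3 is replacing one `shapely.Polygon` call per cell with a single vectorized `shapely.from_ragged_array` call over a flat coordinate array plus offset arrays. The sketch below illustrates the offsets layout on toy data; it is not part of the patch, the names (`toy_coords`, `ring_offsets`, `geom_offsets`) are illustrative only, and it assumes shapely >= 2.0.

# Illustrative only (not from spatialdata-io): two toy cell boundaries stored as one
# flat coordinate array, mirroring how the reader lays out the Xenium vertices.
import numpy as np
from shapely import GeometryType, from_ragged_array

# Vertices of cell A (4 points) followed by vertices of cell B (4 points).
# Each ring is closed by repeating its first vertex (GEOS requires closed rings).
toy_coords = np.array(
    [
        [0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0],  # cell A
        [2.0, 2.0], [3.0, 2.0], [3.0, 3.0], [2.0, 2.0],  # cell B
    ]
)
# ring_offsets: slice boundaries of each ring's vertex run in toy_coords.
ring_offsets = np.array([0, 4, 8])
# geom_offsets: slice boundaries of each polygon's rings (one ring per polygon here).
geom_offsets = np.array([0, 1, 2])

polygons = from_ragged_array(GeometryType.POLYGON, toy_coords, offsets=(ring_offsets, geom_offsets))
print(polygons[0])  # POLYGON ((0 0, 1 0, 1 1, 0 0))

With the construction done in a single vectorized call, the per-cell `Polygon(...)` loop and its joblib parallelism are no longer needed, consistent with the removal of `n_jobs` from the reader (PATCH 1/3) and from the CLI wrapper (PATCH 3/3).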