From 18bffac9f6ac722c57e186ce7e9dc1b7cc9a54fe Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 00:58:47 +0000
Subject: [PATCH 01/33] Move most metadata from setup.py to pyproject.toml

The modern approach is to put more things into `pyproject.toml`. This
moves most things out of `setup.py`.

The version number is now read from the file
`qsimcirq/_version.py` in both setup.py and pyproject.toml, so that
there is a single source of truth for that value.
---
 pyproject.toml | 135 +++++++++++++++++++++++++++++++++++++++++++++----
 setup.py       |  91 +++------------------------------
 2 files changed, 132 insertions(+), 94 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 37d714455..e519fedd2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,28 +12,133 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Note: there are altogether 3 types of dependencies listed in this file:
+#
+#  [build-system].requires: the packages needed for the build system. This list
+#  is not stored in the package metadata.
+#
+#  [project].dependencies: the other packages minimally needed to be able to
+#  install and run qsimcirq. These are things like Cirq, NumPy, etc. Equivalent
+#  to "install_requires" in setuptools' setup.py. The list gets stored in the
+#  metadata of the package; when the project is installed by pip, this is the
+#  specification that is used to install its dependencies.
+#
+#  [dependency-groups].dev: the development dependencies; i.e., what a
+#  developer needs in order to run unit tests, linters, and formatters. The
+#  "[dependency-groups]" section is a Python packaging feature defined by PEP
+#  735. This list is not stored in the metadata of the package. To install the
+#  development dependencies, use "pip install --group dev" (pip >= 25.1).
+
 [build-system]
+build-backend = "setuptools.build_meta"
 requires = [
-  "packaging",
-  "setuptools>=78.1.1",
-  "pybind11[global]",
-  # "pip install" from sources needs to build Pybind, which needs CMake too.
-  "cmake~=3.28.1",
+    "cmake~=3.28.1",  # setup.py drives CMake to compile the extensions.
+    "pybind11[global]",  # "global" puts its CMake files where CMake looks.
+    "setuptools>=78.1.1",
 ]
-build-backend = "setuptools.build_meta"
+
+[project]
+name = "qsimcirq"
+description = "High-performance quantum circuit simulator for C++ and Python."
+authors = [
+    { name = "The qsim/qsimh Developers", email = "qsim-qsimh-dev@googlegroups.com" }
+]
+maintainers = [
+    { name = "Google Quantum AI", email = "quantum-oss-maintainers@google.com" }
+]
+readme = {file = "README.md", content-type = "text/markdown"}
+license = "Apache-2.0"
+requires-python = ">=3.10.0"
+dynamic = ["version", "dependencies"]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Environment :: GPU :: NVIDIA CUDA",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Operating System :: MacOS :: MacOS X",
+    "Operating System :: Microsoft :: Windows",
+    "Operating System :: POSIX :: Linux",
+    "Programming Language :: C++",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Scientific/Engineering :: Quantum Computing",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Typing :: Typed",
+]
+keywords = [
+    "algorithms",
+    "cirq",
+    "nisq",
+    "quantum algorithm development",
+    "quantum circuit simulator",
+    "quantum computer simulator",
+    "quantum computing",
+    "quantum programming",
+    "quantum simulation",
+    "quantum",
+    "schrödinger-feynman simulation",
+    "simulation",
+    "state vector simulator",
+]
+
+[project.urls]
+documentation = "https://quantumai.google/qsim"
+download = "https://pypi.org/project/qsimcirq/#files"
+homepage = "https://quantumai.google/qsim"
+issues = "https://github.com/quantumlib/qsim/issues"
+source = "https://github.com/quantumlib/qsim"
+
+[dependency-groups]
+# Development dependencies. Install these with "pip install --group dev".
+dev = [
+    "black~=25.9.0",
+    "cibuildwheel",
+    "flynt~=1.0",
+    "isort[colors]~=6.0.1",
+    "py-cpuinfo",
+    "pylint~=4.0.2",
+    "pytest",
+    "pytest-xdist",
+]
+
+[tool.setuptools]
+packages = ["qsimcirq"]
+package-data = {"qsimcirq" = ["py.typed"]}
+
+[tool.setuptools.dynamic]
+# The next one becomes the value of [project].version.
+version = {attr = "qsimcirq._version.__version__"}
+# The next one becomes [project].dependencies, equivalent to "install_requires"
+# in setuptools' setup.py. "pip install qsimcirq" installs these automatically.
+dependencies = {file = ["requirements.txt"] }
 
 [tool.cibuildwheel]
-test-extras = "dev"
+build = "cp310-* cp311-* cp312-* cp313-*"
 dependency-versions = "latest"
 enable = ["cpython-prerelease"]
 environment.PIP_PREFER_BINARY = "1"
 # Due to package & module name conflict, temporarily move it away to run tests:
-before-test = "mv {package}/qsimcirq /tmp"
-test-command = "pytest -s -v {package}/qsimcirq_tests/qsimcirq_test.py && mv /tmp/qsimcirq {package}"
+before-test = "pip install --group dev && mv {package}/qsimcirq /tmp"
+test-command = """
+pytest -n auto -s -v {package}/qsimcirq_tests/qsimcirq_test.py &&
+mv /tmp/qsimcirq {package}
+"""
 
 [tool.cibuildwheel.macos]
-before-build = "brew install -q libomp llvm@19 && brew unlink libomp && brew unlink llvm@19 && brew link --force libomp && brew link --force llvm@19"
-repair-wheel-command = "delocate-listdeps {wheel} && delocate-wheel --verbose --require-archs {delocate_archs} -w {dest_dir} {wheel}"
+before-build = """
+brew install -q libomp llvm@19 &&
+brew unlink libomp &&
+brew unlink llvm@19 &&
+brew link --force libomp &&
+brew link --force llvm@19
+"""
+repair-wheel-command = """
+delocate-listdeps {wheel} &&
+delocate-wheel --verbose --require-archs {delocate_archs} -w {dest_dir} {wheel}
+"""
 
 [tool.cibuildwheel.linux]
 manylinux-x86_64-image = "manylinux2014"
@@ -43,3 +148,11 @@ skip = "*musllinux*"
 [tool.black]
 target-version = ['py310', 'py311', 'py312', 'py313']
 extend-exclude = 'third_party'
+
+[tool.isort]
+profile = 'black'
+order_by_type = false  # Sort alphabetically, irrespective of case.
+skip_gitignore = true
+combine_as_imports = true
+known_first_party = ["cirq*"]
+extend_skip = ["__init__.py"]
diff --git a/setup.py b/setup.py
index 328b27cdd..d3ccfa54c 100644
--- a/setup.py
+++ b/setup.py
@@ -24,6 +24,10 @@
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
 
+# qsimcirq/_version.py contains the source of truth for the version number.
+__version__ = runpy.run_path("qsimcirq/_version.py")["__version__"]
+assert __version__, "The version string must not be empty"
+
 
 class CMakeExtension(Extension):
     def __init__(self, name, sourcedir=""):
@@ -67,6 +71,8 @@ def build_extension(self, ext):
                 "-DCMAKE_CUDA_COMPILER=nvcc",
             ]
 
+        # Append additional CMake arguments from the environment variable.
+        # This is e.g. used by cibuildwheel to force a certain C++ standard.
         additional_cmake_args = os.environ.get("CMAKE_ARGS", "")
         if additional_cmake_args:
             cmake_args += additional_cmake_args.split()
@@ -110,9 +116,7 @@ def build_extension(self, ext):
 
         env = os.environ.copy()
         cxxflags = env.get("CXXFLAGS", "")
-        env["CXXFLAGS"] = (
-            f'{cxxflags} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"'
-        )
+        env["CXXFLAGS"] = f'{cxxflags} -DVERSION_INFO=\\"{__version__}\\"'
         if not os.path.exists(self.build_temp):
             os.makedirs(self.build_temp)
         subprocess.check_call(
@@ -124,42 +128,7 @@ def build_extension(self, ext):
         )
 
 
-with open("requirements.txt") as f:
-    requirements = [
-        line.strip() for line in f if line.strip() and not line.strip().startswith("#")
-    ]
-with open("dev-requirements.txt") as f:
-    dev_requirements = [
-        line.strip() for line in f if line.strip() and not line.strip().startswith("#")
-    ]
-
-description = "Schrödinger and Schrödinger-Feynman simulators for quantum circuits."
-
-# README file as long_description.
-with open("README.md", encoding="utf-8") as f:
-    long_description = f.read()
-
-__version__ = runpy.run_path("qsimcirq/_version.py")["__version__"]
-if not __version__:
-    raise ValueError("Version string cannot be empty")
-
 setup(
-    name="qsimcirq",
-    version=__version__,
-    url="https://github.com/quantumlib/qsim",
-    author="The qsim/qsimh Developers",
-    author_email="qsim-qsimh-dev@googlegroups.com",
-    maintainer="Google Quantum AI",
-    maintainer_email="quantum-oss-maintainers@google.com",
-    python_requires=">=3.10.0",
-    install_requires=requirements,
-    extras_require={
-        "dev": dev_requirements,
-    },
-    license="Apache-2.0",
-    description=description,
-    long_description=long_description,
-    long_description_content_type="text/markdown",
     ext_modules=[
         CMakeExtension("qsimcirq/qsim_avx512"),
         CMakeExtension("qsimcirq/qsim_avx2"),
@@ -170,49 +139,5 @@ def build_extension(self, ext):
         CMakeExtension("qsimcirq/qsim_decide"),
         CMakeExtension("qsimcirq/qsim_hip"),
     ],
-    cmdclass=dict(build_ext=CMakeBuild),
-    zip_safe=False,
-    packages=["qsimcirq"],
-    package_data={"qsimcirq": ["py.typed"]},
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Environment :: GPU :: NVIDIA CUDA",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Science/Research",
-        "Operating System :: MacOS :: MacOS X",
-        "Operating System :: Microsoft :: Windows",
-        "Operating System :: POSIX :: Linux",
-        "Programming Language :: C++",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-        "Programming Language :: Python :: 3.12",
-        "Programming Language :: Python :: 3.13",
-        "Topic :: Scientific/Engineering :: Quantum Computing",
-        "Topic :: Software Development :: Libraries :: Python Modules",
-        "Typing :: Typed",
-    ],
-    keywords=[
-        "algorithms",
-        "api",
-        "application programming interface",
-        "cirq",
-        "google quantum",
-        "google",
-        "nisq",
-        "python",
-        "quantum algorithm development",
-        "quantum circuit simulator",
-        "quantum computer simulator",
-        "quantum computing",
-        "quantum computing research",
-        "quantum programming",
-        "quantum simulation",
-        "quantum",
-        "schrödinger-feynman simulation",
-        "sdk",
-        "simulation",
-        "state vector simulator",
-        "software development kit",
-    ],
+    cmdclass={"build_ext": CMakeBuild},
 )

From 0f7a2b6ba641799f50fc4885f8feca959b15753d Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 00:59:50 +0000
Subject: [PATCH 02/33] Move contents of dev-requirements.txt into
 pyproject.toml

Remove no-longer-needed dev-requirements.txt. Its contents are in
pyproject.toml now.
---
 dev-requirements.txt | 12 ------------
 1 file changed, 12 deletions(-)
 delete mode 100644 dev-requirements.txt

diff --git a/dev-requirements.txt b/dev-requirements.txt
deleted file mode 100644
index 2a0fc0a36..000000000
--- a/dev-requirements.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-cmake~=3.28.1
-black~=25.9.0
-flynt~=1.0
-isort[colors]~=6.0.1
-# The global option to pybind11 makes it include CMake files in a location where
-# CMake will find them. It makes a crucial difference in some environments.
-pybind11[global]
-pylint~=4.0.2
-pytest
-pytest-xdist
-py-cpuinfo
-setuptools>=78.1.1

From 78383e150a9361a177f875b7a3d977676f2aef9d Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 01:01:04 +0000
Subject: [PATCH 03/33] Replace references to dev-requirements.txt

The more modern way of handling development dependencies is to put them
in pyproject.toml and then use `pip install --group dev` to install them.
---
 .github/workflows/ci.yaml                | 36 ++++++++++++++++++------
 .github/workflows/cirq_compatibility.yml |  4 +--
 Dockerfile                               | 13 +++++----
 MANIFEST.in                              |  1 -
 docs/install_qsimcirq.md                 | 20 ++++++++-----
 5 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index d74432949..6fd9d7842 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -73,10 +73,15 @@ jobs:
           cache: pip
           cache-dependency-path: |
             requirements.txt
-            dev-requirements.txt
+            pyproject.toml
+
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip
 
       - name: Install dependencies
-        run: pip install -r requirements.txt -r dev-requirements.txt
+        run: |
+          pip install -r requirements.txt
+          pip install --group dev
 
       - name: Check format
         continue-on-error: ${{inputs.soft-linting == 'true'}}
@@ -237,10 +242,15 @@ jobs:
           cache: pip
           cache-dependency-path: |
             requirements.txt
-            dev-requirements.txt
+            pyproject.toml
+
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip
 
       - name: Install dependencies
-        run: pip install -r requirements.txt -r dev-requirements.txt
+        run: |
+          pip install -r requirements.txt
+          pip install --group dev
 
       - name: Set up Bazel
         uses: './.github/actions/set-up-bazel'
@@ -322,10 +332,15 @@ jobs:
           cache: pip
           cache-dependency-path: |
             requirements.txt
-            dev-requirements.txt
+            pyproject.toml
+
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip
 
       - name: Install dependencies
-        run: pip install -r requirements.txt -r dev-requirements.txt
+        run: |
+          pip install -r requirements.txt
+          pip install --group dev
 
       - name: Set up Bazel
         uses: './.github/actions/set-up-bazel'
@@ -377,10 +392,15 @@ jobs:
           cache: pip
           cache-dependency-path: |
             requirements.txt
-            dev-requirements.txt
+            pyproject.toml
+
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip
 
       - name: Install dependencies
-        run: pip install -r requirements.txt -r dev-requirements.txt
+        run: |
+          pip install -r requirements.txt
+          pip install --group dev
 
       - name: Set up Bazel
         uses: './.github/actions/set-up-bazel'
diff --git a/.github/workflows/cirq_compatibility.yml b/.github/workflows/cirq_compatibility.yml
index 5fc76ff0f..18d974bef 100644
--- a/.github/workflows/cirq_compatibility.yml
+++ b/.github/workflows/cirq_compatibility.yml
@@ -52,7 +52,7 @@ jobs:
           cache: pip
           cache-dependency-path: |
             requirements.txt
-            dev-requirements.txt
+            pyproject.toml
 
       - name: Install latest dev version of Cirq
         run: pip install --upgrade cirq~=1.0.dev
@@ -60,7 +60,7 @@ jobs:
       - name: Install qsim dev requirements
         run: |
           pip install -r requirements.txt
-          pip install -r dev-requirements.txt
+          pip install --group dev
 
       - name: Run Python tests
         env:
diff --git a/Dockerfile b/Dockerfile
index 7d82f2eb5..07c5bcbab 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,7 +20,7 @@ ARG CUDA_PATH
 ENV PATH="$CUDA_PATH/bin:$PATH"
 
 # Update package list & install some basic tools we'll need.
-# hadolint ignore=DL3009,DL3008
+# hadolint ignore=DL3008,DL3009
 RUN apt-get update && \
     apt-get install -y make g++ wget git --no-install-recommends && \
     apt-get install -y python3-dev python3-pip python3-venv --no-install-recommends
@@ -37,8 +37,10 @@ COPY ./circuits/ /qsim/circuits/
 COPY ./lib/ /qsim/lib/
 COPY ./pybind_interface/ /qsim/lib/
 COPY ./qsimcirq_tests/ /qsim/qsimcirq_tests/
+COPY ./pyproject.toml /qsim/pyproject.toml
 COPY ./requirements.txt /qsim/requirements.txt
-COPY ./dev-requirements.txt /qsim/dev-requirements.txt
+
+WORKDIR /qsim/
 
 # Create venv to avoid collision between system packages and what we install.
 RUN python3 -m venv --upgrade-deps test_env
@@ -47,12 +49,11 @@ RUN python3 -m venv --upgrade-deps test_env
 ENV PATH="/test_env/bin:$PATH"
 
 # Install qsim requirements.
-# hadolint ignore=DL3042
-RUN python3 -m pip install -r /qsim/requirements.txt && \
-    python3 -m pip install -r /qsim/dev-requirements.txt
+# hadolint ignore=DL3013
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir -r requirements.txt
 
 # Compile qsim.
-WORKDIR /qsim/
 RUN make -j qsim
 
 ENTRYPOINT ["/qsim/apps/qsim_base.x"]
diff --git a/MANIFEST.in b/MANIFEST.in
index 4b487267f..2968589be 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,4 @@
 include requirements.txt
-include dev-requirements.txt
 include CMakeLists.txt
 
 graft pybind_interface
diff --git a/docs/install_qsimcirq.md b/docs/install_qsimcirq.md
index 6887a57f1..441dc3ead 100644
--- a/docs/install_qsimcirq.md
+++ b/docs/install_qsimcirq.md
@@ -10,18 +10,24 @@ directly in C++ code without building and installing the qsimcirq interface.
 
 ## Before installation
 
-Prior to installation, consider opening a
+Prior to installation, consider creating a
 [virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
 
-Prerequisites are included in the
+Prerequisites for installing and running qsim are included in the
 [`requirements.txt`](https://github.com/quantumlib/qsim/blob/main/requirements.txt)
-file, and will be automatically installed along with qsimcirq.
+file, and will be automatically installed along with qsimcirq when you install
+it with pip.
 
-If you'd like to develop qsimcirq, a separate set of dependencies are includes
+If you'd like to develop qsimcirq, a separate set of dependencies is defined
 in the
-[`dev-requirements.txt`](https://github.com/quantumlib/qsim/blob/main/dev-requirements.txt)
-file. You can install them with `pip3 install -r dev-requirements.txt` or
-`pip3 install qsimcirq[dev]`.
+[`pyproject.toml`](https://github.com/quantumlib/qsim/blob/main/pyproject.toml)
+file. Using pip version 25.1 or higher, you can install them with the following
+commands:
+
+```shell
+pip install -r requirements.txt
+pip install --group dev
+```
 
 ## Linux installation
 

From 2c772143b0e3d3a6084a3261c17a433288e8f1aa Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 01:02:32 +0000
Subject: [PATCH 04/33] Add more constraints & comments to requirements.txt

Some transitive dependencies have had version updates that cause
conflicts for some combinations of our builds, such as contourpy
requiring Python 3.11. This puts more constraints on this so that
qsimcirq builds everywhere.
---
 requirements.txt | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 21e8a4ade..a927b3363 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,17 @@
+# Installation and run-time dependencies for qsimcirq. This file is read
+# by pyproject.toml.
+
 absl-py
 cirq-core~=1.0
-numpy>=1.26.0
+numpy>=1.26.0,<2.0; python_version < '3.11'
+numpy>=2.0; python_version >= '3.11'
+
+# These are needed because installing qsimcirq in some environments may require
+# pip to compile Pybind for that specific platform:
+cmake~=3.28.1
+pybind11[global]
+
+# These are transitive dependencies we need to constrain to avoid unresolvable
+# installation conflicts due to them requiring higher Python versions:
+scipy<1.16; python_version < '3.11'
+contourpy<1.3; python_version < '3.11'

From aedd47568b979197c3b4fdfd9082142b0ab16288 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 01:41:58 +0000
Subject: [PATCH 05/33] More Dockerfile fixes

1. Need to change the workdir at a different point.
2. Need to activate the venv.
---
 Dockerfile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 07c5bcbab..2dd040373 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -40,10 +40,11 @@ COPY ./qsimcirq_tests/ /qsim/qsimcirq_tests/
 COPY ./pyproject.toml /qsim/pyproject.toml
 COPY ./requirements.txt /qsim/requirements.txt
 
-WORKDIR /qsim/
-
 # Create venv to avoid collision between system packages and what we install.
-RUN python3 -m venv --upgrade-deps test_env
+RUN python3 -m venv --upgrade-deps test_env && \
+    . test_env/bin/activate
+
+WORKDIR /qsim/
 
 # Activate venv.
 ENV PATH="/test_env/bin:$PATH"

From c2d1d3a8c2c64d4afb2511545187908b78bf3ab6 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 01:53:24 +0000
Subject: [PATCH 06/33] Remove some changes that belong in a separate PR

---
 pyproject.toml | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e519fedd2..55235fa18 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -123,7 +123,7 @@ environment.PIP_PREFER_BINARY = "1"
 # Due to package & module name conflict, temporarily move it away to run tests:
 before-test = "pip install --group dev && mv {package}/qsimcirq /tmp"
 test-command = """
-pytest -n auto -s -v {package}/qsimcirq_tests/qsimcirq_test.py &&
+pytest -s -v {package}/qsimcirq_tests/qsimcirq_test.py &&
 mv /tmp/qsimcirq {package}
 """
 
@@ -148,11 +148,3 @@ skip = "*musllinux*"
 [tool.black]
 target-version = ['py310', 'py311', 'py312', 'py313']
 extend-exclude = 'third_party'
-
-[tool.isort]
-profile = 'black'
-order_by_type = false  # Sort alphabetically, irrespective of case.
-skip_gitignore = true
-combine_as_imports = true
-known_first_party = ["cirq*"]
-extend_skip = ["__init__.py"]

From 59c8cb160b255c2dcbc890843186721a0318b09e Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 01:55:40 +0000
Subject: [PATCH 07/33] Remove a spurious change to reduce the diff noise

---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2dd040373..100c68e92 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,7 +20,7 @@ ARG CUDA_PATH
 ENV PATH="$CUDA_PATH/bin:$PATH"
 
 # Update package list & install some basic tools we'll need.
-# hadolint ignore=DL3008,DL3009
+# hadolint ignore=DL3009,DL3008
 RUN apt-get update && \
     apt-get install -y make g++ wget git --no-install-recommends && \
     apt-get install -y python3-dev python3-pip python3-venv --no-install-recommends
@@ -37,8 +37,8 @@ COPY ./circuits/ /qsim/circuits/
 COPY ./lib/ /qsim/lib/
 COPY ./pybind_interface/ /qsim/lib/
 COPY ./qsimcirq_tests/ /qsim/qsimcirq_tests/
-COPY ./pyproject.toml /qsim/pyproject.toml
 COPY ./requirements.txt /qsim/requirements.txt
+COPY ./pyproject.toml /qsim/pyproject.toml
 
 # Create venv to avoid collision between system packages and what we install.
 RUN python3 -m venv --upgrade-deps test_env && \

From 794c818c323c7dc41e89e71122db8abd036ec8c5 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 02:12:27 +0000
Subject: [PATCH 08/33] Need to install setuptools on Python >= 3.12

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 55235fa18..a2d82787f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,6 +96,8 @@ source = "https://github.com/quantumlib/qsim"
 dev = [
     "black~=25.9.0",
     "cibuildwheel",
+    # Distutils was removed from Python in 3.12.
+    "setuptools; python_version >= 3.12",
     "flynt~=1.0",
     "isort[colors]~=6.0.1",
     "py-cpuinfo",

From 3b4102a991a845f3fca73db8d295bceff4559cd0 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 02:14:01 +0000
Subject: [PATCH 09/33] Fix syntax

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a2d82787f..a4f3e4613 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -97,7 +97,7 @@ dev = [
     "black~=25.9.0",
     "cibuildwheel",
     # Distutils was removed from Python in 3.12.
-    "setuptools; python_version >= 3.12",
+    "setuptools; python_version >= '3.12'",
     "flynt~=1.0",
     "isort[colors]~=6.0.1",
     "py-cpuinfo",

From 2a565d1a27aec7b4b911e9accc084258f994ed32 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Fri, 2 Jan 2026 02:31:53 +0000
Subject: [PATCH 10/33] Need to install Python dev dependencies

---
 pybind_interface/Dockerfile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pybind_interface/Dockerfile b/pybind_interface/Dockerfile
index 54a4ff511..2b1cb4fd0 100644
--- a/pybind_interface/Dockerfile
+++ b/pybind_interface/Dockerfile
@@ -26,5 +26,9 @@ WORKDIR /qsim/
 # Build pybind code early to cache the results
 RUN make -j -C /qsim/ pybind
 
+# Install Python development dependencies.
+# hadolint ignore=DL3013
+RUN pip install --no-cache-dir --group dev
+
 # Compile and run qsim tests
 ENTRYPOINT ["make", "-C", "/qsim/", "run-py-tests"]

From 80f980e86b649229718904b614e6b9523327137a Mon Sep 17 00:00:00 2001
From: Sergei Isakov <54642992+sergeisakov@users.noreply.github.com>
Date: Mon, 5 Jan 2026 07:30:24 +0100
Subject: [PATCH 11/33] Add experimental support for cuStateVecEx. (#965)

---
 CMakeLists.txt                                |   1 +
 Makefile                                      |  19 +
 apps/Makefile                                 |   9 +
 apps/make.sh                                  |   3 +-
 apps/qsim_base_custatevecex.cu                | 160 +++++
 docs/cirq_interface.md                        |   8 +-
 lib/BUILD                                     |  73 +++
 lib/io.h                                      |  12 +
 lib/io_file.h                                 |   4 +
 lib/multiprocess_custatevecex.h               | 213 ++++++
 lib/run_custatevecex.h                        | 313 +++++++++
 lib/simulator_custatevec.h                    |  17 +-
 lib/simulator_custatevecex.h                  | 243 +++++++
 lib/statespace_custatevec.h                   |   6 +-
 lib/statespace_custatevecex.h                 | 431 +++++++++++++
 lib/util_cuda.h                               |  19 +-
 lib/util_custatevec.h                         |   4 +-
 lib/util_custatevecex.h                       |  46 ++
 lib/vectorspace_custatevecex.h                | 610 ++++++++++++++++++
 pybind_interface/Makefile                     |  10 +-
 pybind_interface/cuda/CMakeLists.txt          |   2 +-
 pybind_interface/custatevec/CMakeLists.txt    |   2 +-
 pybind_interface/custatevecex/CMakeLists.txt  |  59 ++
 .../custatevecex/pybind_main_custatevecex.cpp |  74 +++
 .../custatevecex/pybind_main_custatevecex.h   |  17 +
 pybind_interface/decide/CMakeLists.txt        |   5 +-
 pybind_interface/decide/decide.cpp            |  20 +-
 pybind_interface/hip/CMakeLists.txt           |   2 +-
 qsimcirq/__init__.py                          |  12 +-
 qsimcirq/qsim_simulator.py                    |  25 +-
 qsimcirq_tests/qsimcirq_test.py               | 109 ++++
 setup.py                                      |   1 +
 tests/Makefile                                |  17 +
 tests/hybrid_custatevecex_test.cu             |  59 ++
 tests/qtrajectory_custatevecex_test.cu        |  88 +++
 tests/run_custatevecex_test.cu                | 262 ++++++++
 tests/simulator_custatevecex_test.cu          | 105 +++
 tests/simulator_testfixture.h                 |  35 +-
 tests/statespace_custatevecex_test.cu         | 119 ++++
 39 files changed, 3167 insertions(+), 47 deletions(-)
 create mode 100644 apps/qsim_base_custatevecex.cu
 create mode 100644 lib/multiprocess_custatevecex.h
 create mode 100644 lib/run_custatevecex.h
 create mode 100644 lib/simulator_custatevecex.h
 create mode 100644 lib/statespace_custatevecex.h
 create mode 100644 lib/util_custatevecex.h
 create mode 100644 lib/vectorspace_custatevecex.h
 create mode 100644 pybind_interface/custatevecex/CMakeLists.txt
 create mode 100644 pybind_interface/custatevecex/pybind_main_custatevecex.cpp
 create mode 100644 pybind_interface/custatevecex/pybind_main_custatevecex.h
 create mode 100644 tests/hybrid_custatevecex_test.cu
 create mode 100644 tests/qtrajectory_custatevecex_test.cu
 create mode 100644 tests/run_custatevecex_test.cu
 create mode 100644 tests/simulator_custatevecex_test.cu
 create mode 100644 tests/statespace_custatevecex_test.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e8a92a47b..8b824c81d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,7 @@ if(NOT CMAKE_APPLE_SILICON_PROCESSOR)
         add_subdirectory(pybind_interface/cuda)
         if(DEFINED ENV{CUQUANTUM_ROOT})
             add_subdirectory(pybind_interface/custatevec)
+            add_subdirectory(pybind_interface/custatevecex)
         endif()
     elseif(has_hipcc)
         add_subdirectory(pybind_interface/hip)
diff --git a/Makefile b/Makefile
index b4e16da06..4b37317c2 100644
--- a/Makefile
+++ b/Makefile
@@ -94,7 +94,10 @@ ifneq (,$(strip $(CUQUANTUM_ROOT)))
         CUSVFLAGS += -lcustatevec -lcublas
         CUSTATEVECFLAGS ?= $(CUSVFLAGS)
         TARGETS += qsim-custatevec
+        TARGETS += qsim-custatevecex
         TESTS += run-custatevec-tests
+        TESTS += run-custatevecex-tests
+        TESTS += run-custatevecex-mpi-tests
     else
         $(warning $$CUQUANTUM_ROOT is set, but the path does not seem to exist)
     endif
@@ -120,6 +123,10 @@ qsim-cuda:
 qsim-custatevec: | check-cuquantum-root-set
 	$(MAKE) -C apps/ qsim-custatevec
 
+.PHONY: qsim-custatevecex
+qsim-custatevecex: | check-cuquantum-root-set
+	$(MAKE) -C apps/ qsim-custatevecex
+
 .PHONY: qsim-hip
 qsim-hip:
 	$(MAKE) -C apps/ qsim-hip
@@ -140,6 +147,10 @@ cuda-tests:
 custatevec-tests: | check-cuquantum-root-set
 	$(MAKE) -C tests/ custatevec-tests
 
+.PHONY: custatevecex-tests
+custatevecex-tests: | check-cuquantum-root-set
+	$(MAKE) -C tests/ custatevecex-tests
+
 .PHONY: hip-tests
 hip-tests:
 	$(MAKE) -C tests/ hip-tests
@@ -156,6 +167,14 @@ run-cuda-tests: cuda-tests
 run-custatevec-tests: custatevec-tests
 	$(MAKE) -C tests/ run-custatevec-tests
 
+.PHONY: run-custatevecex-tests
+run-custatevecex-tests: custatevecex-tests
+	$(MAKE) -C tests/ run-custatevecex-tests
+
+.PHONY: run-custatevecex-mpi-tests
+run-custatevecex-mpi-tests: custatevecex-tests
+	$(MAKE) -C tests/ run-custatevecex-mpi-tests
+
 .PHONY: run-hip-tests
 run-hip-tests: hip-tests
 	$(MAKE) -C tests/ run-hip-tests
diff --git a/apps/Makefile b/apps/Makefile
index 48b25cabd..19ccbc422 100644
--- a/apps/Makefile
+++ b/apps/Makefile
@@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x)
 CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu")
 CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x)
 
+CUSTATEVECEX_TARGETS = $(shell find . -maxdepth 1 -name "*custatevecex.cu")
+CUSTATEVECEX_TARGETS := $(CUSTATEVECEX_TARGETS:%custatevecex.cu=%custatevecex.x)
+
 HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu')
 HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)
 
@@ -19,6 +22,9 @@ qsim-cuda: $(CUDA_TARGETS)
 .PHONY: qsim-custatevec
 qsim-custatevec: $(CUSTATEVEC_TARGETS)
 
+.PHONY: qsim-custatevecex
+qsim-custatevecex: $(CUSTATEVECEX_TARGETS)
+
 .PHONY: qsim-hip
 qsim-hip: $(HIP_TARGETS)
 
@@ -31,6 +37,9 @@ qsim-hip: $(HIP_TARGETS)
 %custatevec.x: %custatevec.cu
 	$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)
 
+%custatevecex.x: %custatevecex.cu
+	$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)
+
 %hip.x: %cuda.cu
 	$(HIPCC) -o ./$@ $< $(HIPCCFLAGS)
 
diff --git a/apps/make.sh b/apps/make.sh
index 610b2eb4f..7ebf00c7b 100755
--- a/apps/make.sh
+++ b/apps/make.sh
@@ -37,7 +37,8 @@ if command -v nvcc &>/dev/null; then
         )
         nvcc -O3 "${CUSTATEVECFLAGS[@]}" \
              -o qsim_base_custatevec.x qsim_base_custatevec.cu
-
+        nvcc -O3 "${CUSTATEVECFLAGS[@]}" \
+             -o qsim_base_custatevecex.x qsim_base_custatevecex.cu
     fi
 elif command -v hipcc &>/dev/null; then
     hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu
diff --git a/apps/qsim_base_custatevecex.cu b/apps/qsim_base_custatevecex.cu
new file mode 100644
index 000000000..99ce1a283
--- /dev/null
+++ b/apps/qsim_base_custatevecex.cu
@@ -0,0 +1,160 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <algorithm>
+#include <complex>
+#include <limits>
+#include <string>
+
+#include "../lib/circuit_qsim_parser.h"
+#include "../lib/formux.h"
+#include "../lib/gates_qsim.h"
+#include "../lib/io_file.h"
+#include "../lib/multiprocess_custatevecex.h"
+#include "../lib/run_custatevecex.h"
+#include "../lib/simulator_custatevecex.h"
+#include "../lib/util_custatevec.h"
+
+struct Options {
+  std::string circuit_file;
+  unsigned maxtime = std::numeric_limits<unsigned>::max();
+  unsigned seed = 1;
+  unsigned verbosity = 0;
+};
+
+Options GetOptions(int argc, char* argv[]) {
+  constexpr char usage[] = "usage:\n  ./qsim_base -c circuit -d maxtime "
+                           "-s seed -v verbosity\n";
+
+  Options opt;
+
+  int k;
+
+  while ((k = getopt(argc, argv, "c:d:s:v:")) != -1) {
+    switch (k) {
+      case 'c':
+        opt.circuit_file = optarg;
+        break;
+      case 'd':
+        opt.maxtime = std::atoi(optarg);
+        break;
+      case 's':
+        opt.seed = std::atoi(optarg);
+        break;
+      case 'v':
+        opt.verbosity = std::atoi(optarg);
+        break;
+      default:
+        qsim::IO::errorf(usage);
+        exit(1);
+    }
+  }
+
+  return opt;
+}
+
+bool ValidateOptions(const Options& opt) {
+  if (opt.circuit_file.empty()) {
+    qsim::IO::errorf("circuit file is not provided.\n");
+    return false;
+  }
+
+  return true;
+}
+
+template <typename StateSpace, typename State>
+void PrintAmplitudes(
+    unsigned num_qubits, const StateSpace& state_space, const State& state) {
+  static constexpr char const* bits[8] = {
+    "000", "001", "010", "011", "100", "101", "110", "111",
+  };
+
+  uint64_t size = std::min(uint64_t{8}, uint64_t{1} << num_qubits);
+  unsigned s = 3 - std::min(unsigned{3}, num_qubits);
+
+  for (uint64_t i = 0; i < size; ++i) {
+    auto a = state_space.GetAmpl(state, i);
+    qsim::IO::messagef("%s:%16.8g%16.8g%16.8g\n",
+                       bits[i] + s, std::real(a), std::imag(a), std::norm(a));
+  }
+}
+
+// Parses the command line, reads the circuit file, simulates the circuit with
+// cuStateVecEx and prints up to eight amplitudes. Returns 1 on invalid
+// options, a circuit that fails to parse, a null state or a failed run.
+int main(int argc, char* argv[]) {
+  using namespace qsim;
+
+  auto opt = GetOptions(argc, argv);
+  if (!ValidateOptions(opt)) {
+    return 1;
+  }
+
+  using fp_type = float;
+  Circuit<GateQSim<fp_type>> circuit;
+  if (!CircuitQsimParser<IOFile>::FromFile(opt.maxtime, opt.circuit_file,
+                                           circuit)) {
+    return 1;
+  }
+
+  struct Factory {
+    using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
+    using StateSpace = Simulator::StateSpace;
+
+    explicit Factory(unsigned verbosity = 0) : verbosity(verbosity) {
+      mp.initialize();
+    }
+
+    StateSpace CreateStateSpace() const {
+      StateSpace::Parameter param;
+      param.verbosity = verbosity;
+
+      return StateSpace{mp, param};
+    }
+
+    Simulator CreateSimulator() const {
+      return Simulator{};
+    }
+
+    MultiProcessCuStateVecEx mp;
+    unsigned verbosity;
+  };
+
+  using Simulator = Factory::Simulator;
+  using StateSpace = Simulator::StateSpace;
+  using State = StateSpace::State;
+  using Runner = CuStateVecExRunner<IO, Factory>;
+
+  Factory factory(opt.verbosity);
+  StateSpace state_space = factory.CreateStateSpace();
+  State state = state_space.Create(circuit.num_qubits);
+
+  if (state_space.IsNull(state)) {
+    IO::errorf("not enough memory: is the number of qubits too large?\n");
+    return 1;
+  }
+  state_space.SetStateZero(state);
+
+  Runner::Parameter param;
+  param.seed = opt.seed;
+  param.verbosity = opt.verbosity;
+
+  if (!Runner::Run(param, factory, circuit, state)) {
+    return 1;
+  }
+  PrintAmplitudes(circuit.num_qubits, state_space, state);
+  return 0;
+}
diff --git a/docs/cirq_interface.md b/docs/cirq_interface.md
index 593da1700..5dd1ddb4f 100644
--- a/docs/cirq_interface.md
+++ b/docs/cirq_interface.md
@@ -186,8 +186,11 @@ library.
 `QSimOptions` provides five parameters to configure GPU execution. `use_gpu`
 is required to enable GPU execution:
 * `use_gpu`: if True, use GPU instead of CPU for simulation.
-* `gpu_mode`: use CUDA if set to 0 (default value) or use the NVIDIA cuStateVec
-library if set to any other value.
+* `gpu_mode`: use CUDA if set to 0 (default value), the NVIDIA cuStateVec
+library if set to 1, or the NVIDIA cuStateVecEx library for any other value.
+
+In the case of the NVIDIA cuStateVecEx library, simulations can be performed
+in multi-device / multi-node environments.
 
 If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can
 optionally be set to fine-tune StateSpace performance for a specific device.
@@ -196,3 +199,4 @@ In most cases, the default values provide good performance.
 StateSpace. This must be a power of 2 in the range [32, 1024].
 * `gpu_data_blocks`: number of data blocks to use for the GPU StateSpace.
 Below 16 data blocks, performance is noticeably reduced.
+
diff --git a/lib/BUILD b/lib/BUILD
index 02aa71bb0..60fd0e51a 100644
--- a/lib/BUILD
+++ b/lib/BUILD
@@ -186,8 +186,10 @@ cuda_library(
         "matrix.h",
         "mps_simulator.h",
         "mps_statespace.h",
+        "multiprocess_custatevecex.h",
         "parfor.h",
         "qtrajectory.h",
+        "run_custatevecex.h",
         "run_qsim.h",
         "run_qsimh.h",
         "seqfor.h",
@@ -198,12 +200,14 @@ cuda_library(
         "simulator_avx512.h",
         "simulator_basic.h",
         "simulator_custatevec.h",
+        "simulator_custatevecex.h",
         "simulator_sse.h",
         "statespace.h",
         "statespace_avx.h",
         "statespace_avx512.h",
         "statespace_basic.h",
         "statespace_custatevec.h",
+        "statespace_custatevecex.h",
         "statespace_sse.h",
         "umux.h",
         "unitary_calculator_avx.h",
@@ -219,8 +223,10 @@ cuda_library(
         "util_cpu.h",
         "util_cuda.h",
         "util_custatevec.h",
+        "util_custatevecex.h",
         "vectorspace.h",
         "vectorspace_cuda.h",
+        "vectorspace_custatevecex.h",
     ],
     copts = ["-D__CUSTATEVEC__"],
     deps = [
@@ -357,6 +363,11 @@ cuda_library(
     hdrs = ["util_custatevec.h"],
 )
 
+cuda_library(
+    name = "util_custatevecex",
+    hdrs = ["util_custatevecex.h"],
+)
+
 ### Input/output libraries ###
 
 cc_library(
@@ -506,6 +517,29 @@ cc_library(
     ],
 )
 
+cuda_library(
+    name = "run_custatevecex",
+    hdrs = ["run_custatevecex.h"],
+    deps = [
+        ":circuit",
+        ":util",
+        ":util_custatevec",
+        ":util_custatevecex",
+    ],
+)
+
+### Multi-process library ###
+
+cuda_library(
+    name = "multiprocess_custatevecex",
+    hdrs = ["multiprocess_custatevecex.h"],
+    deps = [
+        ":io",
+        ":util_custatevec",
+        ":util_custatevecex",
+    ],
+)
+
 ### Vectorspace libraries ###
 
 cc_library(
@@ -518,6 +552,18 @@ cuda_library(
     hdrs = ["vectorspace_cuda.h"],
 )
 
+cuda_library(
+    name = "vectorspace_custatevecex",
+    hdrs = ["vectorspace_custatevecex.h"],
+    deps = [
+        ":io",
+        ":multiprocess_custatevecex",
+        ":util_cuda",
+        ":util_custatevec",
+        ":util_custatevecex",
+    ],
+)
+
 ### Statespace libraries ###
 
 cc_library(
@@ -591,6 +637,20 @@ cuda_library(
     ],
 )
 
+cuda_library(
+    name = "statespace_custatevecex",
+    hdrs = [
+        "statespace_custatevecex.h",
+    ],
+    deps = [
+        ":multiprocess_custatevecex",
+        ":statespace",
+        ":util_custatevec",
+        ":util_custatevecex",
+        ":vectorspace_custatevecex",
+    ],
+)
+
 ### Simulator libraries ###
 
 cc_library(
@@ -660,6 +720,19 @@ cuda_library(
     ],
 )
 
+cuda_library(
+    name = "simulator_custatevecex",
+    hdrs = [
+        "simulator_custatevecex.h",
+    ],
+    deps = [
+        ":io",
+        ":statespace_custatevecex",
+        ":util_custatevec",
+        ":util_custatevecex",
+    ],
+)
+
 # All three state-vector simulators with multiplexer
 cc_library(
     name = "simulator",
diff --git a/lib/io.h b/lib/io.h
index 3b26c7cc6..97de5fc12 100644
--- a/lib/io.h
+++ b/lib/io.h
@@ -20,11 +20,19 @@
 
 namespace qsim {
 
+namespace output {
+  static bool enabled = true;  // NOTE(review): `static` => one copy per TU.
+}
+
 /**
  * Controller for output logs.
  */
 struct IO {
   static void errorf(const char* format, ...) {
+    if (!output::enabled) {
+      return;
+    }
+
     va_list args;
     va_start(args, format);
     vfprintf(stderr, format, args);
@@ -32,6 +40,10 @@ struct IO {
   }
 
   static void messagef(const char* format, ...) {
+    if (!output::enabled) {
+      return;
+    }
+
     va_list args;
     va_start(args, format);
     vprintf(format, args);
diff --git a/lib/io_file.h b/lib/io_file.h
index 3cfac12db..789efbd60 100644
--- a/lib/io_file.h
+++ b/lib/io_file.h
@@ -47,6 +47,10 @@ struct IOFile : public IO {
 
   static bool WriteToFile(
       const std::string& file, const void* data, uint64_t size) {
+    if (!output::enabled) {
+      return true;
+    }
+
     auto fs = std::fstream(file, std::ios::out | std::ios::binary);
 
     if (!fs) {
diff --git a/lib/multiprocess_custatevecex.h b/lib/multiprocess_custatevecex.h
new file mode 100644
index 000000000..9c4a13bb1
--- /dev/null
+++ b/lib/multiprocess_custatevecex.h
@@ -0,0 +1,213 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MULTIPROCESS_CUSTATEVECEX_H_
+#define MULTIPROCESS_CUSTATEVECEX_H_
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <custatevecEx.h>
+#include <custatevecEx_ext.h>
+
+#include <cstdint>
+#include <vector>
+
+#include "io.h"
+#include "util_custatevec.h"
+#include "util_custatevecex.h"
+
+namespace qsim {
+
+// Multi-process context for cuStateVecEx: owns the communicator and derives
+// the global-qubit layout used to configure multi-process state vectors.
+struct MultiProcessCuStateVecEx {
+  enum NetworkType {
+    kSuperPod = 0,
+    kGB200NVL = 1,
+    kSwitchTree = 2,
+    kCommunicator = 3,
+  };
+
+  struct Parameter {
+    uint64_t transfer_buffer_size = 16777216;
+    NetworkType network_type = kSuperPod;
+  };
+
+  MultiProcessCuStateVecEx(Parameter param = Parameter{16777216, kSuperPod})
+      : param_(param), communicator_(nullptr), initialized_(false) {}
+
+  ~MultiProcessCuStateVecEx() {
+    if (communicator_) {
+      custatevecExCommunicatorDestroy(communicator_);
+    }
+    // NOTE(review): also runs when initialize() failed or was never called.
+    custatevecExCommunicatorStatus_t status;
+    custatevecExCommunicatorFinalize(&status);
+  }
+
+  // Accessors. Until initialize() succeeds, num_processes() and rank() report
+  // the single-process defaults (1 and 0).
+  custatevecExCommunicatorDescriptor_t communicator() const {
+    return communicator_;
+  }
+  unsigned num_processes() const { return num_processes_; }
+  unsigned rank() const { return rank_; }
+  bool initialized() const { return initialized_; }
+
+  // Sets up the communicator and the global-qubit layout. Returns early,
+  // leaving initialized() false, if the communicator cannot be set up or the
+  // number of processes is not a power of two >= 2. Exits the process if the
+  // network layers for the network type cannot hold all global qubits.
+  void initialize() {
+    int argc = 0;
+    char** argv = nullptr;
+    auto comm_type = CUSTATEVEC_COMMUNICATOR_TYPE_OPENMPI;
+
+    custatevecExCommunicatorStatus_t comm_status;
+    auto status = custatevecExCommunicatorInitialize(
+        comm_type, nullptr, &argc, &argv, &comm_status);
+
+    if (status != CUSTATEVEC_STATUS_SUCCESS ||
+        comm_status != CUSTATEVEC_EX_COMMUNICATOR_STATUS_SUCCESS) {
+      return;
+    }
+
+    communicator_ = nullptr;
+    status = custatevecExCommunicatorCreate(&communicator_);
+    if (status != CUSTATEVEC_STATUS_SUCCESS) {
+      return;
+    }
+
+    int num_processes, rank;
+    ErrorCheck(communicator_->intf->getSize(communicator_, &num_processes));
+    ErrorCheck(communicator_->intf->getRank(communicator_, &rank));
+
+    // Only rank 0 keeps IO output enabled; every other rank is silenced.
+    if (rank != 0) {
+      output::enabled = false;
+    }
+
+    if (num_processes < 2 || (num_processes & (num_processes - 1)) != 0) {
+      return;
+    }
+
+    num_global_qubits_ = get_num_global_qubits(num_processes);
+    unsigned num_acc_global_qubits = 0;
+    auto network_layers = get_network_layers(param_.network_type);
+
+    num_global_qubits_per_layer_.reserve(2);
+    global_index_bit_classes_.reserve(2);
+
+    // Hand out the global qubits layer by layer. A layer whose
+    // num_global_qubits is 0 has no limit and takes all that remain.
+    for (const auto& layer : network_layers) {
+      auto k = num_global_qubits_ - num_acc_global_qubits;
+      global_index_bit_classes_.push_back(layer.global_index_bit_class);
+      if (layer.num_global_qubits == 0 || k <= layer.num_global_qubits) {
+        num_global_qubits_per_layer_.push_back(k);
+        num_acc_global_qubits = num_global_qubits_;
+        break;
+      }
+
+      num_global_qubits_per_layer_.push_back(layer.num_global_qubits);
+      num_acc_global_qubits += layer.num_global_qubits;
+    }
+
+    if (num_acc_global_qubits < num_global_qubits_) {
+      IO::errorf("error: too few network layers at %s %d.\n",
+                 __FILE__, __LINE__);
+      exit(1);
+    }
+
+    memory_sharing_method_ = CUSTATEVEC_EX_MEMORY_SHARING_METHOD_NONE;
+
+    for (const auto& layer : network_layers) {
+      if (layer.global_index_bit_class ==
+          CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P) {
+        memory_sharing_method_ = CUSTATEVEC_EX_MEMORY_SHARING_METHOD_AUTODETECT;
+        break;
+      }
+    }
+
+    num_processes_ = num_processes;
+    rank_ = rank;
+    initialized_ = true;
+  }
+
+  // Creates a multi-process state vector configuration. Returns nullptr if
+  // not initialized, if num_qubits < 3, or if fewer than two local qubits
+  // would remain after removing the global qubits.
+  auto create_sv_config(unsigned num_qubits, cudaDataType_t data_type) const {
+    custatevecExDictionaryDescriptor_t sv_config = nullptr;
+
+    if (!initialized_ ||
+        num_qubits < 3 || num_global_qubits_ + 2 > num_qubits) {
+      return sv_config;
+    }
+
+    unsigned num_local_qubits = num_qubits - num_global_qubits_;
+
+    ErrorCheck(custatevecExConfigureStateVectorMultiProcess(
+        &sv_config, data_type, num_qubits, num_local_qubits, -1,
+        memory_sharing_method_, global_index_bit_classes_.data(),
+        reinterpret_cast<const int32_t*>(num_global_qubits_per_layer_.data()),
+        static_cast<int32_t>(global_index_bit_classes_.size()),
+        param_.transfer_buffer_size, nullptr, 0));
+
+    return sv_config;
+  }
+
+ private:
+  Parameter param_;
+  custatevecExCommunicatorDescriptor_t communicator_;
+  std::vector<unsigned> num_global_qubits_per_layer_;
+  std::vector<custatevecExGlobalIndexBitClass_t> global_index_bit_classes_;
+  // Defaults below describe a single process until initialize() succeeds.
+  custatevecExMemorySharingMethod_t memory_sharing_method_ =
+      CUSTATEVEC_EX_MEMORY_SHARING_METHOD_NONE;
+  unsigned num_processes_ = 1;
+  unsigned num_global_qubits_ = 0;
+  unsigned rank_ = 0;
+  bool initialized_;
+
+  struct NetworkLayer {
+    custatevecExGlobalIndexBitClass_t global_index_bit_class;
+    unsigned num_global_qubits;
+  };
+
+  using NetworkLayers = std::vector<NetworkLayer>;
+
+  // Returns the ordered layers of the given network type (empty if unknown).
+  static NetworkLayers get_network_layers(NetworkType id) {
+    switch (id) {
+    case kSuperPod:
+      return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 3},
+             {CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_COMMUNICATOR, 0}};
+    case kGB200NVL:
+      return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 0}};
+    case kSwitchTree:
+      return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 2},
+             {CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 1}};
+    case kCommunicator:
+      return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_COMMUNICATOR, 0}};
+    }
+
+    return NetworkLayers{};
+  }
+};
+
+}  // namespace qsim
+
+#endif  // MULTIPROCESS_CUSTATEVECEX_H_
diff --git a/lib/run_custatevecex.h b/lib/run_custatevecex.h
new file mode 100644
index 000000000..2a2b8b1da
--- /dev/null
+++ b/lib/run_custatevecex.h
@@ -0,0 +1,313 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef RUN_CUSTATEVECEX_H_
+#define RUN_CUSTATEVECEX_H_
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include <custatevecEx.h>
+
+#include "circuit.h"
+#include "util.h"
+#include "util_custatevec.h"
+#include "util_custatevecex.h"
+
+namespace qsim {
+
+/**
+ * Helper struct for running qsim with the cuStateVecEx library.
+ */
+template <typename IO, typename Factory, typename RGen = std::mt19937>
+struct CuStateVecExRunner final {
+ public:
+  using Simulator = typename Factory::Simulator;
+  using StateSpace = typename Simulator::StateSpace;
+  using State = typename StateSpace::State;
+  using MeasurementResult = typename StateSpace::MeasurementResult;
+
+  /**
+   * User-specified parameters for simulation.
+   */
+  struct Parameter {
+    /**
+     * Random number generator seed to apply measurement gates.
+     */
+    uint64_t seed;
+
+    unsigned verbosity = 0;
+  };
+
+  /**
+   * Runs the given circuit, only measuring at the end.
+   * @param param Options for the random seed and verbosity.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param measure Function that performs measurements (in the sense of
+   *   computing expectation values, etc).
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit, typename MeasurementFunc>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, MeasurementFunc measure) {
+    return Run(param, factory, {circuit.gates.back().time}, circuit, measure);
+  }
+
+  /**
+   * Runs the given circuit, measuring at user-specified times.
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param times_to_measure_at Time steps at which to perform measurements.
+   * @param circuit The circuit to be simulated.
+   * @param measure Function that performs measurements (in the sense of
+   *   computing expectation values, etc).
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit, typename MeasurementFunc>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const std::vector<unsigned>& times_to_measure_at,
+                  const Circuit& circuit, MeasurementFunc measure) {
+    std::vector<MeasurementResult> discarded_results;
+
+    StateSpace state_space = factory.CreateStateSpace();
+    Simulator simulator = factory.CreateSimulator();
+
+    auto state = state_space.Create(circuit.num_qubits);
+    if (state_space.IsNull(state)) {
+      IO::errorf("not enough memory: is the number of qubits too large?\n");
+      return false;
+    }
+
+    state_space.SetStateZero(state);
+
+    return Run(param, circuit, state_space, simulator, state,
+               times_to_measure_at, measure, discarded_results);
+  }
+
+  /**
+   * Runs the given circuit and make the final state available to the caller,
+   * recording the result of any intermediate measurements in the circuit.
+   * @param param Options for the random seed and verbosity.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param state As an input parameter, this should contain the initial state
+   *   of the system. After a successful run, it will be populated with the
+   *   final state of the system.
+   * @param measure_results As an input parameter, this should be empty.
+   *   After a successful run, this will contain all measurements results from
+   *   the run, ordered by time and qubit index.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, State& state,
+                  std::vector<MeasurementResult>& measure_results) {
+    auto measure = [](unsigned, const StateSpace&, const State&) {};
+
+    StateSpace state_space = factory.CreateStateSpace();
+    Simulator simulator = factory.CreateSimulator();
+
+    return Run(param, circuit, state_space, simulator, state,
+               {}, measure, measure_results);
+  }
+
+  /**
+   * Runs the given circuit and make the final state available to the caller,
+   * discarding the result of any intermediate measurements in the circuit.
+   * @param param Options for the random seed and verbosity.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param state As an input parameter, this should contain the initial state
+   *   of the system. After a successful run, it will be populated with the
+   *   final state of the system.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, State& state) {
+    auto measure = [](unsigned, const StateSpace&, const State&) {};
+
+    StateSpace state_space = factory.CreateStateSpace();
+    Simulator simulator = factory.CreateSimulator();
+
+    std::vector<MeasurementResult> discarded_results;
+
+    return Run(param, circuit, state_space, simulator, state,
+               {}, measure, discarded_results);
+  }
+
+  /**
+   * Runs the given circuit and make the final state available to the caller,
+   * recording the result of any intermediate measurements in the circuit.
+   * @param param Options for the random seed and verbosity.
+   * @param circuit The circuit to be simulated.
+   * @param state_space StateSpace object required to perform measurements.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param state As an input parameter, this should contain the initial state
+   *   of the system. After a successful run, it will be populated with the
+   *   final state of the system.
+   * @param measure_results As an input parameter, this should be empty.
+   *   After a successful run, this will contain all measurements results from
+   *   the run, ordered by time and qubit index.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit>
+  static bool Run(const Parameter& param, const Circuit& circuit,
+                  const StateSpace& state_space, const Simulator& simulator,
+                  State& state,
+                  std::vector<MeasurementResult>& measure_results) {
+    auto measure = [](unsigned, const StateSpace&, const State&) {};
+
+    return Run(param, circuit, state_space, simulator, state,
+               {}, measure, measure_results);
+  }
+
+  /**
+   * Runs the given circuit and make the final state available to the caller,
+   * discarding the result of any intermediate measurements in the circuit.
+   * @param param Options for the random seed and verbosity.
+   * @param circuit The circuit to be simulated.
+   * @param state_space StateSpace object required to perform measurements.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param state As an input parameter, this should contain the initial state
+   *   of the system. After a successful run, it will be populated with the
+   *   final state of the system.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit>
+  static bool Run(const Parameter& param, const Circuit& circuit,
+                  const StateSpace& state_space, const Simulator& simulator,
+                  State& state) {
+    auto measure = [](unsigned, const StateSpace&, const State&) {};
+
+    std::vector<MeasurementResult> discarded_results;
+
+    return Run(param, circuit, state_space, simulator, state,
+               {}, measure, discarded_results);
+  }
+
+ private:
+  /**
+   * Runs the circuit on the given state. Gates are enqueued in a cuStateVecEx
+   * SVUpdater, which is applied to the state before measurement gates, gates
+   * acting on no qubits and measurement callbacks, and at the end of the run.
+   * @return False if a measurement gate fails; true otherwise.
+   */
+  template <typename Circuit, typename MeasurementFunc>
+  static bool Run(const Parameter& param, const Circuit& circuit,
+                  const StateSpace& state_space, const Simulator& simulator,
+                  State& state,
+                  const std::vector<unsigned>& times_to_measure_at,
+                  MeasurementFunc measure,
+                  std::vector<MeasurementResult>& measure_results) {
+    RGen rgen(param.seed);
+
+    custatevecExSVUpdaterDescriptor_t sv_updater = nullptr;
+    custatevecExDictionaryDescriptor_t sv_updater_config = nullptr;
+    ErrorCheck(custatevecExConfigureSVUpdater(
+        &sv_updater_config, StateSpace::kStateDataType, nullptr, 0));
+    ErrorCheck(
+        custatevecExSVUpdaterCreate(&sv_updater, sv_updater_config, nullptr));
+    ErrorCheck(custatevecExDictionaryDestroy(sv_updater_config));
+
+    double t0 = 0.0;
+    if (param.verbosity > 0) {
+      t0 = GetTime();
+    }
+
+    using Gates = detail::Gates<Circuit>;
+    const auto& gates = Gates::get(circuit);
+    unsigned cur_time_index = 0;
+
+    for (std::size_t i = 0; i < gates.size(); ++i) {
+      const auto& gate = Gates::gate(gates[i]);
+      unsigned num_qubits = gate.qubits.size();
+      unsigned num_cqubits = gate.controlled_by.size();
+
+      if (gate.kind == gate::kMeasurement) {
+        ErrorCheck(
+            custatevecExSVUpdaterApply(sv_updater, state.get(), nullptr, 0));
+        ErrorCheck(custatevecExSVUpdaterClear(sv_updater));
+        auto measure_result = state_space.Measure(gate.qubits, rgen, state);
+        if (!measure_result.valid) {
+          IO::errorf("measurement failed.\n");
+          // Release the updater on the failure path as well.
+          ErrorCheck(custatevecExSVUpdaterDestroy(sv_updater));
+          return false;
+        }
+        measure_results.push_back(std::move(measure_result));
+      } else if (num_cqubits == 0) {
+        if (num_qubits == 0) {
+          ErrorCheck(
+            custatevecExSVUpdaterApply(sv_updater, state.get(), nullptr, 0));
+          ErrorCheck(custatevecExSVUpdaterClear(sv_updater));
+          simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
+        } else {
+          ErrorCheck(custatevecExSVUpdaterEnqueueMatrix(
+              sv_updater, gate.matrix.data(), StateSpace::kMatrixDataType,
+              StateSpace::kExMatrixType, StateSpace::kMatrixLayout, 0,
+              reinterpret_cast<const int32_t*>(gate.qubits.data()),
+              num_qubits, nullptr, nullptr, 0));
+        }
+      } else {
+        std::vector<int32_t> control_bits;
+        control_bits.reserve(num_cqubits);
+
+        for (std::size_t j = 0; j < num_cqubits; ++j) {
+          control_bits.push_back((gate.cmask >> j) & 1);
+        }
+
+        ErrorCheck(custatevecExSVUpdaterEnqueueMatrix(
+            sv_updater, gate.matrix.data(), StateSpace::kMatrixDataType,
+            StateSpace::kExMatrixType, StateSpace::kMatrixLayout, 0,
+            reinterpret_cast<const int32_t*>(gate.qubits.data()), num_qubits,
+            reinterpret_cast<const int32_t*>(gate.controlled_by.data()),
+            control_bits.data(), num_cqubits));
+      }
+
+      // Stop once every requested measurement time has been handled.
+      if (cur_time_index < times_to_measure_at.size()) {
+        unsigned t = times_to_measure_at[cur_time_index];
+        if (i == gates.size() - 1 || t < Gates::gate(gates[i + 1]).time) {
+          ErrorCheck(
+              custatevecExSVUpdaterApply(sv_updater, state.get(), nullptr, 0));
+          ErrorCheck(custatevecExSVUpdaterClear(sv_updater));
+
+          // Call back to perform measurements.
+          measure(cur_time_index, state_space, state);
+          ++cur_time_index;
+        }
+      }
+    }
+
+    ErrorCheck(custatevecExSVUpdaterApply(sv_updater, state.get(), nullptr, 0));
+
+    if (param.verbosity > 0) {
+      state_space.DeviceSync();
+      double t1 = GetTime();
+      IO::messagef("simu time is %g seconds.\n", t1 - t0);
+    }
+
+    ErrorCheck(custatevecExSVUpdaterDestroy(sv_updater));
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // RUN_CUSTATEVECEX_H_
diff --git a/lib/simulator_custatevec.h b/lib/simulator_custatevec.h
index b3f3cb8fa..a13c6e1af 100644
--- a/lib/simulator_custatevec.h
+++ b/lib/simulator_custatevec.h
@@ -82,8 +82,9 @@ class SimulatorCuStateVec final {
       ErrorCheck(custatevecApplyMatrix(
                      custatevec_handle_, state.get(), kStateType,
                      state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
-                     (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0,
-                     kComputeType, workspace_, workspace_size));
+                     reinterpret_cast<const int32_t*>(qs.data()), qs.size(),
+                     nullptr, nullptr, 0, kComputeType, workspace_,
+                     workspace_size));
     }
   }
 
@@ -118,9 +119,10 @@ class SimulatorCuStateVec final {
       ErrorCheck(custatevecApplyMatrix(
                      custatevec_handle_, state.get(), kStateType,
                      state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
-                     (int32_t*) qs.data(), qs.size(),
-                     (int32_t*) cqs.data(), control_bits.data(), cqs.size(),
-                     kComputeType, workspace_, workspace_size));
+                     reinterpret_cast<const int32_t*>(qs.data()), qs.size(),
+                     reinterpret_cast<const int32_t*>(cqs.data()),
+                     control_bits.data(), cqs.size(), kComputeType,
+                     workspace_, workspace_size));
     }
   }
 
@@ -144,9 +146,12 @@ class SimulatorCuStateVec final {
     ErrorCheck(custatevecComputeExpectation(
                    custatevec_handle_, state.get(), kStateType,
                    state.num_qubits(), &eval, kExpectType, nullptr, matrix,
-                   kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(),
+                   kMatrixType, kMatrixLayout,
+                   reinterpret_cast<const int32_t*>(qs.data()), qs.size(),
                    kComputeType, workspace_, workspace_size));
 
+    ErrorCheck(cudaDeviceSynchronize());
+
     return {cuCreal(eval), cuCimag(eval)};
   }
 
diff --git a/lib/simulator_custatevecex.h b/lib/simulator_custatevecex.h
new file mode 100644
index 000000000..bcfb2c519
--- /dev/null
+++ b/lib/simulator_custatevecex.h
@@ -0,0 +1,243 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUSTATEVECEX_H_
+#define SIMULATOR_CUSTATEVECEX_H_
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+
+#include <cuComplex.h>
+#include <custatevecEx.h>
+
+#include "io.h"
+#include "statespace_custatevecex.h"
+#include "util_custatevec.h"
+#include "util_custatevecex.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator using the NVIDIA cuStateVecEx API (custatevecEx.h).
+ */
+template <typename FP = float>
+class SimulatorCuStateVecEx final {
+ public:
+  using StateSpace = StateSpaceCuStateVecEx<FP>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  static constexpr auto kStateDataType = StateSpace::kStateDataType;
+  static constexpr auto kMatrixDataType = StateSpace::kMatrixDataType;
+  static constexpr auto kExMatrixType = StateSpace::kExMatrixType;
+  static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout;
+  static constexpr auto kExpectDataType = CUDA_C_64F;
+  static constexpr auto kComputeType =
+      StateSpace::is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F;
+
+  SimulatorCuStateVecEx() {}
+
+  /**
+   * Applies a gate via custatevecExApplyMatrix (or a scalar if qs is empty).
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    if (qs.empty()) {
+      StateSpace::Multiply(matrix[0], matrix[1], state);
+      return;
+    }
+
+    const unsigned global_qubits = get_num_global_qubits(state.num_substates());
+    const unsigned local_qubits = state.num_qubits() - global_qubits;
+    if (local_qubits < qs.size()) {
+      IO::errorf("error: the number of gate qubits exceeds the number of "
+                 "local qubits at %s %d.\n", __FILE__, __LINE__);
+      exit(1);
+    }
+
+    ErrorCheck(custatevecExApplyMatrix(
+        state.get(), matrix, kMatrixDataType, kExMatrixType, kMatrixLayout,
+        0, reinterpret_cast<const int32_t*>(qs.data()), qs.size(),
+        // No control qubits for a plain gate.
+        nullptr, nullptr, 0));
+  }
+
+  /**
+   * Applies a controlled gate through custatevecExApplyMatrix.
+   * @param qs Indices of the qubits affected by this gate; must not be empty.
+   * @param cqs Indices of control qubits.
+   * @param cmask Bit mask of control qubit values; bit i goes with cqs[i].
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cmask,
+                           const fp_type* matrix, State& state) const {
+    if (qs.empty()) {
+      IO::errorf(
+          "error: controlled global phase gate is not implemented %s %d.\n",
+          __FILE__, __LINE__);
+      exit(1);
+    }
+
+    const unsigned global_qubits = get_num_global_qubits(state.num_substates());
+    const unsigned local_qubits = state.num_qubits() - global_qubits;
+
+    if (local_qubits < qs.size()) {
+      IO::errorf("error: the number of gate qubits exceeds the number of "
+                 "local qubits at %s %d.\n", __FILE__, __LINE__);
+      exit(1);
+    }
+
+    // Unpack the mask into one 0/1 entry per control qubit.
+    std::vector<int32_t> control_bits(cqs.size());
+
+    for (std::size_t i = 0; i < control_bits.size(); ++i) {
+      control_bits[i] = (cmask >> i) & 1;
+    }
+
+    ErrorCheck(custatevecExApplyMatrix(
+        state.get(), matrix, kMatrixDataType, kExMatrixType, kMatrixLayout,
+        0, reinterpret_cast<const int32_t*>(qs.data()), qs.size(),
+        reinterpret_cast<const int32_t*>(cqs.data()),
+        control_bits.data(),
+        cqs.size()));
+  }
+
+  /**
+   * Computes the expectation value of an operator using the NVIDIA cuStateVec
+   * library. May permute the index bits of `state` to make all of qs local.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The value produced by state.reduce() from the per-device results.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    unsigned num_qubits = state.num_qubits();
+    unsigned num_global_qubits = get_num_global_qubits(state.num_substates());
+    unsigned num_local_qubits = num_qubits - num_global_qubits;
+
+    if (qs.size() > num_local_qubits) {
+      IO::errorf("error: the number of gate qubits exceeds the number of "
+                 "local qubits at %s %d.\n", __FILE__, __LINE__);
+      exit(1);
+    }
+
+    const auto& wire_ordering = state.get_wire_ordering();
+
+    // Wire ordering can be arbitrary. The code below translates qs into
+    // positions within that ordering (qs2) and swaps every non-local position
+    // with an unused local one, recording the swaps in perm.
+    std::vector<unsigned> perm;
+    perm.reserve(num_qubits);
+
+    for (unsigned i = 0; i < num_qubits; ++i) {
+      perm.push_back(i);
+    }
+    // l is the next local position to try as a swap target; it only grows.
+    unsigned l = 0;
+    std::vector<unsigned> qs2(qs.size());
+
+    for (unsigned k = 0; k < qs.size(); ++k) {
+      for (unsigned i = 0; i < num_qubits; ++i) {
+        if (qs[k] == (unsigned) wire_ordering[i]) {
+          qs2[k] = i;
+          break;
+        }
+      }
+    }
+    // Relocate each gate qubit whose position is not local.
+    for (unsigned k = 0; k < qs2.size(); ++k) {
+      if (qs2[k] >= num_local_qubits) {
+        unsigned j = 0;
+        while (j < qs2.size()) {
+          for (j = 0; j < qs2.size(); ++j) {
+            if (qs2[j] == l) {
+              ++l;
+
+              if (l == num_local_qubits) {
+                // We should not get here.
+                IO::errorf("error: internal error at %s %d.\n",
+                           __FILE__, __LINE__);
+                exit(1);
+              }
+
+              break;
+            }
+          }
+        }
+        // No entry of qs2 equals l here, so l is a free local position.
+        std::swap(perm[qs2[k]], perm[l]);
+        qs2[k] = l++;
+      }
+    }
+
+    if (l > 0) {
+      ErrorCheck(custatevecExStateVectorPermuteIndexBits(
+          state.get(), reinterpret_cast<const int32_t*>(perm.data()),
+          num_qubits, CUSTATEVEC_EX_PERMUTATION_SCATTER));
+    }
+    // Per-device work on resource r; the results are combined by reduce().
+    auto f = [&matrix, &state, &num_local_qubits, &qs2](
+        unsigned i, const auto& r) {
+      void* workspace;
+      size_t workspace_size;
+
+      ErrorCheck(cudaSetDevice(r.device_id));
+
+      ErrorCheck(custatevecComputeExpectationGetWorkspaceSize(
+          r.custatevec_handle, kStateDataType, num_local_qubits, matrix,
+          kMatrixDataType, kMatrixLayout, qs2.size(), kComputeType,
+          &workspace_size));
+
+      // TODO: reuse allocated memory.
+      ErrorCheck(cudaMalloc(&workspace, workspace_size));
+
+      cuDoubleComplex eval;
+
+      ErrorCheck(custatevecComputeExpectation(
+          r.custatevec_handle, r.device_ptr, kStateDataType, num_local_qubits,
+          &eval, kExpectDataType, nullptr, matrix, kMatrixDataType,
+          kMatrixLayout, reinterpret_cast<const int32_t*>(qs2.data()),
+          qs2.size(), kComputeType, workspace, workspace_size));
+
+      // TODO: make it faster.
+      ErrorCheck(custatevecExStateVectorSynchronize(state.get()));
+      ErrorCheck(cudaFree(workspace));
+
+      return std::complex<double>{cuCreal(eval), cuCimag(eval)};
+    };
+
+    return state.reduce(f);
+  }
+
+  /**
+   * @return The size of SIMD register if applicable; this class returns 32.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 32;
+  }
+
+ private:
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_CUSTATEVECEX_H_
diff --git a/lib/statespace_custatevec.h b/lib/statespace_custatevec.h
index f2f5de107..6bd0a37d2 100644
--- a/lib/statespace_custatevec.h
+++ b/lib/statespace_custatevec.h
@@ -306,8 +306,10 @@ class StateSpaceCuStateVec :
 
     ErrorCheck(custatevecBatchMeasure(
                    custatevec_handle_, state.get(), kStateType,
-                   state.num_qubits(), (int*) result.bitstring.data(),
-                   (int*) qubits.data(), qubits.size(), r, collapse));
+                   state.num_qubits(),
+                   reinterpret_cast<int*>(result.bitstring.data()),
+                   reinterpret_cast<const int*>(qubits.data()), qubits.size(),
+                   r, collapse));
 
     for (std::size_t i = 0; i < result.bitstring.size(); ++i) {
       result.bits |= result.bitstring[i] << qubits[i];
diff --git a/lib/statespace_custatevecex.h b/lib/statespace_custatevecex.h
new file mode 100644
index 000000000..ce5cb0c3e
--- /dev/null
+++ b/lib/statespace_custatevecex.h
@@ -0,0 +1,431 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_CUSTATEVECEX_H_
+#define STATESPACE_CUSTATEVECEX_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <cublas_v2.h>
+#include <cuComplex.h>
+#include <custatevecEx.h>
+
+#include "multiprocess_custatevecex.h"
+#include "statespace.h"
+#include "util_custatevec.h"
+#include "util_custatevecex.h"
+#include "vectorspace_custatevecex.h"
+
+namespace qsim {
+
+namespace detail {
+
+// Writes (v, 0) into each of the first `size` interleaved value pairs.
+template <typename FP>
+__global__ void SetStateKernel(FP v, uint64_t size, void* state) {
+  const uint64_t idx = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+  if (idx >= size) return;
+
+  FP* amplitude = static_cast<FP*>(state) + 2 * idx;
+  amplitude[0] = v;
+  amplitude[1] = 0;
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for cuStateVec state-vector
+ * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`.
+ */
+template <typename FP = float>
+class StateSpaceCuStateVecEx :
+    public StateSpace<StateSpaceCuStateVecEx<FP>, VectorSpaceCuStateVecEx, FP> {
+ private:
+  using Base =
+      StateSpace<StateSpaceCuStateVecEx<FP>, VectorSpaceCuStateVecEx, FP>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+  using Parameter = typename Base::Parameter;
+
+  static constexpr auto kStateDataType = Base::kStateDataType;
+  static constexpr auto kMatrixDataType = kStateDataType;
+  static constexpr auto kExMatrixType = CUSTATEVEC_EX_MATRIX_DENSE;
+  static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW;
+
+  explicit StateSpaceCuStateVecEx(const MultiProcessCuStateVecEx& mp,
+                                  Parameter param = Parameter{})
+      : Base(param, mp) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return uint64_t{2} << num_qubits;  // 2 * 2^num_qubits
+  }
+
+  void InternalToNormalOrder(State& state) const {
+    state.to_normal_order();  // The reordering is delegated to State.
+  }
+
+  void NormalToInternalOrder(State& state) const {  // No-op: body is empty.
+  }
+
+  // Sets every amplitude to zero by running SetStateKernel via state.assign.
+  void SetAllZeros(State& state) const {
+    uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates();
+    auto f = [&size](unsigned i, const auto& r) {
+      unsigned threads = size < 256 ? size : 256;
+      unsigned blocks = size / threads;
+      fp_type zero = 0.0;
+      detail::SetStateKernel<<<blocks, threads>>>(zero, size, r.device_ptr);
+      ErrorCheck(cudaPeekAtLastError());  // Do not lose launch failures.
+    };
+    state.assign(f);
+  }
+
+  // Uniform superposition: every amplitude becomes 1 / sqrt(2^num_qubits).
+  void SetStateUniform(State& state) const {
+    uint64_t size = uint64_t{1} << state.num_qubits();
+    fp_type v = double{1} / std::sqrt(size);
+    size /= state.num_substates();
+
+    auto f = [&size, &v](unsigned i, const auto& r) {
+      unsigned threads = size < 256 ? size : 256;
+      unsigned blocks = size / threads;
+      detail::SetStateKernel<<<blocks, threads>>>(v, size, r.device_ptr);
+      ErrorCheck(cudaPeekAtLastError());  // Do not lose launch failures.
+    };
+    state.assign(f);
+  }
+
+  // Sets the |0> state through custatevecExStateVectorSetZeroState.
+  void SetStateZero(State& state) const {
+    ErrorCheck(custatevecExStateVectorSetZeroState(state.get()));
+  }
+
+  // Reads amplitude i of `state`. It is not recommended to use this function.
+  std::complex<fp_type> GetAmpl(const State& state, uint64_t i) const {
+    fp_type buf[2] = {0, 0};
+    // Build k by placing bit wire_ordering[j] of i at bit position j.
+    uint64_t k = 0;
+    const auto& wire_ordering = state.get_wire_ordering();
+    for (unsigned j = 0; j < state.num_qubits(); ++j) {
+      k |= ((i >>  wire_ordering[j]) & 1) << j;
+    }
+
+    uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates();
+    unsigned required_rank = k / size;
+    // Outside multi-process mode, or on rank required_rank, fetch entry k.
+    if (state.distr_type() != Base::kMultiProcess
+        || Base::mp.rank() == required_rank) {
+      ErrorCheck(custatevecExStateVectorGetState(
+          state.get(), buf, kStateDataType, k, k + 1, 1));
+    }
+
+    ErrorCheck(custatevecExStateVectorSynchronize(state.get()));
+    // Multi-process: bcast buf, passing required_rank (presumably the root).
+    if (state.distr_type() == Base::kMultiProcess) {
+      auto cuda_type = GetCudaType<std::complex<fp_type>>();
+      auto comm = Base::mp.communicator();
+      ErrorCheck(comm->intf->bcast(comm, buf, 1, cuda_type, required_rank));
+    }
+
+    return {buf[0], buf[1]};
+  }
+
+  // Writes `ampl` at index i. It is not recommended to use this function.
+  void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) const {
+    fp_type buf[2] = {std::real(ampl), std::imag(ampl)};
+    // Build k by placing bit wire_ordering[j] of i at bit position j.
+    uint64_t k = 0;
+    const auto& wire_ordering = state.get_wire_ordering();
+    for (unsigned j = 0; j < state.num_qubits(); ++j) {
+      k |= ((i >>  wire_ordering[j]) & 1) << j;
+    }
+
+    uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates();
+    unsigned required_rank = k / size;
+    // Outside multi-process mode, or on rank required_rank, store entry k.
+    if (state.distr_type() != Base::kMultiProcess
+        || Base::mp.rank() == required_rank) {
+      ErrorCheck(custatevecExStateVectorSetState(
+          state.get(), buf, kStateDataType, k, k + 1, 1));
+    }
+
+    ErrorCheck(custatevecExStateVectorSynchronize(state.get()));
+  }
+
+  // Writes (re, im) at index i. It is not recommended to use this function.
+  void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) const {
+    fp_type buf[2] = {re, im};
+    // Build k by placing bit wire_ordering[j] of i at bit position j.
+    uint64_t k = 0;
+    const auto& wire_ordering = state.get_wire_ordering();
+    for (unsigned j = 0; j < state.num_qubits(); ++j) {
+      k |= ((i >>  wire_ordering[j]) & 1) << j;
+    }
+
+    uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates();
+    unsigned required_rank = k / size;
+    // Outside multi-process mode, or on rank required_rank, store entry k.
+    if (state.distr_type() != Base::kMultiProcess
+        || Base::mp.rank() == required_rank) {
+      ErrorCheck(custatevecExStateVectorSetState(
+          state.get(), buf, kStateDataType, k, k + 1, 1));
+    }
+
+    ErrorCheck(custatevecExStateVectorSynchronize(state.get()));
+  }
+
+  // Intended contract: state[i] = val where (i & mask) == bits, or where
+  // (i & mask) != bits if `exclude` is true. The body is currently empty.
+  static void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                          const std::complex<fp_type>& val,
+                          bool exclude = false) {
+    // Not implemented: `state` is left unchanged.
+  }
+
+  // Intended contract: state[i] = complex(re, im) where (i & mask) == bits, or
+  // where (i & mask) != bits if `exclude` is true. The body is currently empty.
+  static void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                          fp_type im, bool exclude = false) {
+    // Not implemented: `state` is left unchanged.
+  }
+
+  // dest += src elementwise; returns false if the qubit counts differ.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+    // 2^num_qubits amplitudes divided evenly by the number of substates.
+    uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates();
+
+    auto f = [&size](unsigned i, const auto& rd, const auto& rs) {
+      cublasHandle_t cublas_handle;
+      ErrorCheck(cublasCreate(&cublas_handle));
+      ErrorCheck(cublasSetStream(cublas_handle, rd.stream));
+      // NOTE(review): cublasCaxpy/Zaxpy take n as int; confirm size fits.
+      if (Base::is_float) {
+        cuComplex a = {1.0, 0.0};
+        auto p1 = (const cuComplex*) rs.device_ptr;
+        auto p2 = (cuComplex*) rd.device_ptr;
+        ErrorCheck(cublasCaxpy(cublas_handle, size, &a, p1, 1, p2, 1));
+      } else {
+        cuDoubleComplex a = {1.0, 0.0};
+        auto p1 = (const cuDoubleComplex*) rs.device_ptr;
+        auto p2 = (cuDoubleComplex*) rd.device_ptr;
+        ErrorCheck(cublasZaxpy(cublas_handle, size, &a, p1, 1, p2, 1));
+      }
+
+      ErrorCheck(cudaStreamSynchronize(rd.stream));
+      ErrorCheck(cublasDestroy(cublas_handle));
+    };
+
+    dest.assign(src, f);
+
+    return true;
+  }
+
+  // state *= a elementwise, with a real scale factor a.
+  static void Multiply(fp_type a, State& state) {
+    uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates();
+
+    auto f = [&a, &size](unsigned i, const auto& r) {
+      cublasHandle_t cublas_handle;
+      ErrorCheck(cublasCreate(&cublas_handle));
+      ErrorCheck(cublasSetStream(cublas_handle, r.stream));
+      // NOTE(review): cublasCsscal/Zdscal take n as int; confirm size fits.
+      if (Base::is_float) {
+        float a1 = a;
+        auto p = (cuComplex*) r.device_ptr;
+        ErrorCheck(cublasCsscal(cublas_handle, size, &a1, p, 1));
+      } else {
+        double a1 = a;
+        auto p = (cuDoubleComplex*) r.device_ptr;
+        ErrorCheck(cublasZdscal(cublas_handle, size, &a1, p, 1));
+      }
+
+      ErrorCheck(cudaStreamSynchronize(r.stream));
+      ErrorCheck(cublasDestroy(cublas_handle));
+    };
+
+    return state.assign(f);
+  }
+
+  // state *= (re + i im) elementwise, with a complex scale factor.
+  static void Multiply(fp_type re, fp_type im, State& state) {
+    uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates();
+
+    auto f = [&re, &im, &size](unsigned i, const auto& r) {
+      cublasHandle_t cublas_handle;
+      ErrorCheck(cublasCreate(&cublas_handle));
+      ErrorCheck(cublasSetStream(cublas_handle, r.stream));
+      // NOTE(review): cublasCscal/Zscal take n as int; confirm size fits.
+      if (Base::is_float) {
+        cuComplex a = {float(re), float(im)};
+        auto p = (cuComplex*) r.device_ptr;
+        ErrorCheck(cublasCscal(cublas_handle, size, &a, p, 1));
+      } else {
+        cuDoubleComplex a = {re, im};
+        auto p = (cuDoubleComplex*) r.device_ptr;
+        ErrorCheck(cublasZscal(cublas_handle, size, &a, p, 1));
+      }
+
+      ErrorCheck(cudaStreamSynchronize(r.stream));
+      ErrorCheck(cublasDestroy(cublas_handle));
+    };
+
+    return state.assign(f);
+  }
+
+  // Inner product of state1 and state2; NaN if the qubit counts differ.
+  static std::complex<double> InnerProduct(
+      const State& state1, const State& state2) {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    uint64_t size =
+        (uint64_t{1} << state1.num_qubits()) / state1.num_substates();
+    auto f = [&size](unsigned i, const auto& r1, const auto& r2) {
+      cublasHandle_t cublas_handle;
+      ErrorCheck(cublasCreate(&cublas_handle));
+      ErrorCheck(cublasSetStream(cublas_handle, r1.stream));
+      std::complex<double> ip;  // Kept until the handle has been released.
+      if (Base::is_float) {
+        cuComplex result;
+        auto p1 = (const cuComplex*) r1.device_ptr;
+        auto p2 = (const cuComplex*) r2.device_ptr;
+        ErrorCheck(cublasCdotc(cublas_handle, size, p1, 1, p2, 1, &result));
+        ip = std::complex<double>{cuCrealf(result), cuCimagf(result)};
+      } else {
+        cuDoubleComplex result;
+        auto p1 = (const cuDoubleComplex*) r1.device_ptr;
+        auto p2 = (const cuDoubleComplex*) r2.device_ptr;
+        ErrorCheck(cublasZdotc(cublas_handle, size, p1, 1, p2, 1, &result));
+        ip = std::complex<double>{cuCreal(result), cuCimag(result)};
+      }
+      ErrorCheck(cudaStreamSynchronize(r1.stream));
+      ErrorCheck(cublasDestroy(cublas_handle));
+      return ip;
+    };
+
+    return state1.reduce(state2, f);
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    return std::real(InnerProduct(state1, state2));  // Real part only.
+  }
+
+  double Norm(const State& state) const {
+    double norm;
+    // Presumably the sum of |amplitude|^2 over the whole state; confirm.
+    ErrorCheck(custatevecExAbs2SumArray(
+        state.get(), &norm, nullptr, 0, nullptr, nullptr, 0));
+    ErrorCheck(custatevecExStateVectorSynchronize(state.get()));
+    // norm is read only after the synchronize call above.
+    return norm;
+  }
+
+  // Draws num_samples bitstrings over all qubits with custatevecExSample.
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      auto rs = GenerateRandomValues<double>(num_samples, seed, 1.0);
+      std::vector<custatevecIndex_t> bitstrings0(num_samples);
+
+      // Qubits 0..n-1 in order. push_back grows size(); reserve() alone does not.
+      std::vector<int32_t> wires;
+      wires.reserve(state.num_qubits());
+      for (unsigned i = 0; i < state.num_qubits(); ++i) {
+        wires.push_back(i);
+      }
+
+      ErrorCheck(custatevecExSample(
+          state.get(), bitstrings0.data(), wires.data(), state.num_qubits(),
+          rs.data(), num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER,
+          nullptr));
+      ErrorCheck(custatevecExStateVectorSynchronize(state.get()));
+
+      bitstrings.reserve(num_samples);
+      for (uint64_t i = 0; i < num_samples; ++i) {
+        bitstrings.push_back(bitstrings0[i]);
+      }
+    }
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  template <typename RGen>
+  MeasurementResult Measure(const std::vector<unsigned>& qubits,
+                            RGen& rgen, State& state,
+                            bool no_collapse = false) const {
+    auto r = RandomValue(rgen, 1.0);
+    // r is handed to custatevecExMeasure below.
+    MeasurementResult result;
+
+    result.valid = true;
+    result.mask = 0;
+    result.bits = 0;
+    result.bitstring.resize(qubits.size(), 0);
+    // An out-of-range qubit yields an invalid result; otherwise build mask.
+    for (auto q : qubits) {
+      if (q >= state.num_qubits()) {
+        result.valid = false;
+        return result;
+      }
+
+      result.mask |= uint64_t{1} << q;
+    }
+
+    auto collapse = no_collapse ?
+        CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO;
+
+    custatevecIndex_t bits;
+
+    ErrorCheck(custatevecExMeasure(
+        state.get(), &bits, reinterpret_cast<const int32_t*>(qubits.data()),
+        qubits.size(), r, collapse, nullptr));
+    ErrorCheck(custatevecExStateVectorSynchronize(state.get()));
+    // Bit i of `bits` is stored as the outcome for qubits[i].
+    for (std::size_t i = 0; i < qubits.size(); ++i) {
+      uint64_t bit = (bits >> i) & 1;
+      result.bitstring[i] = bit;
+      result.bits |= bit << qubits[i];
+    }
+
+    return result;
+  }
+  // Forwards to Measure with no_collapse = true, casting away const on state.
+  template <typename RGen>
+  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
+                                   RGen& rgen, const State& state) const {
+    return Measure(qubits, rgen, const_cast<State&>(state), true);
+  }
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    // Not implemented: `mr` is ignored and `state` is left unchanged.
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_CUSTATEVECEX_H_
diff --git a/lib/util_cuda.h b/lib/util_cuda.h
index 5d8cb5df3..b34292753 100644
--- a/lib/util_cuda.h
+++ b/lib/util_cuda.h
@@ -22,6 +22,7 @@
 #endif
 
 #include <cstdlib>
+#include <type_traits>
 
 #include "io.h"
 
@@ -31,11 +32,27 @@ namespace qsim {
 
 inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) {
   if (code != cudaSuccess) {
-    IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
+    IO::errorf(
+        "CUDA error: %s at %s %d\n", cudaGetErrorString(code), file, line);
     exit(code);
   }
 }
 
+// Returns the CUDA data type enumerator that corresponds to T:
+//   float -> CUDA_R_32F, double -> CUDA_R_64F,
+//   std::complex<float> -> CUDA_C_32F, std::complex<double> -> CUDA_C_64F.
+// Every other T also yields CUDA_C_64F.
+template <typename T>
+inline auto GetCudaType() {
+  constexpr bool is_f32 = std::is_same_v<T, float>;
+  constexpr bool is_f64 = std::is_same_v<T, double>;
+  constexpr bool is_c32 = std::is_same_v<T, std::complex<float>>;
+
+  return is_f32 ? CUDA_R_32F
+       : is_f64 ? CUDA_R_64F
+       : is_c32 ? CUDA_C_32F : CUDA_C_64F;
+}
+
 template <typename T>
 struct Complex {
   __host__ __device__ __forceinline__ Complex() {}
diff --git a/lib/util_custatevec.h b/lib/util_custatevec.h
index 36f29efab..d37858b40 100644
--- a/lib/util_custatevec.h
+++ b/lib/util_custatevec.h
@@ -25,7 +25,7 @@ namespace qsim {
 
 inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) {
   if (code != CUBLAS_STATUS_SUCCESS) {
-    IO::errorf("cuBLAS error %i: %s %d\n", code, file, line);
+    IO::errorf("cuBLAS error %d at %s %d\n", code, file, line);
     exit(code);
   }
 }
@@ -33,7 +33,7 @@ inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) {
 inline void ErrorAssert(
     custatevecStatus_t code, const char* file, unsigned line) {
   if (code != CUSTATEVEC_STATUS_SUCCESS) {
-    IO::errorf("custatevec error: %s %s %d\n",
+    IO::errorf("cuStateVec error: %s at %s %d\n",
                 custatevecGetErrorString(code), file, line);
     exit(code);
   }
diff --git a/lib/util_custatevecex.h b/lib/util_custatevecex.h
new file mode 100644
index 000000000..ab57d7e85
--- /dev/null
+++ b/lib/util_custatevecex.h
@@ -0,0 +1,46 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_CUSTATEVECEX_H_
+#define UTIL_CUSTATEVECEX_H_
+
+#include <custatevecEx.h>
+#include <custatevecEx_ext.h>
+
+#include "io.h"
+#include "util_cuda.h"
+
+namespace qsim {
+
+// Reports a failed communicator call via IO::errorf and exits with its code.
+inline void ErrorAssert(
+    custatevecExCommunicatorStatus_t code, const char* file, unsigned line) {
+  if (code == CUSTATEVEC_EX_COMMUNICATOR_STATUS_SUCCESS) return;
+  IO::errorf(
+      "cuStateVecEx communicator error %d at %s %d\n", code, file, line);
+  exit(code);
+}
+
+// Returns floor(log2(num_devices)); both 0 and 1 give 0.
+inline unsigned get_num_global_qubits(unsigned num_devices) {
+  unsigned num_global_qubits = 0;
+  for (unsigned n = num_devices >> 1; n > 0; n >>= 1) {
+    ++num_global_qubits;
+  }
+  return num_global_qubits;
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_CUSTATEVECEX_H_
diff --git a/lib/vectorspace_custatevecex.h b/lib/vectorspace_custatevecex.h
new file mode 100644
index 000000000..3fa26a931
--- /dev/null
+++ b/lib/vectorspace_custatevecex.h
@@ -0,0 +1,610 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VECTORSPACE_CUSTATEVECEX_H_
+#define VECTORSPACE_CUSTATEVECEX_H_
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <custatevecEx.h>
+
+#include <algorithm>
+#include <complex>
+#include <type_traits>
+#include <vector>
+
+#include "io.h"
+#include "multiprocess_custatevecex.h"
+#include "util_cuda.h"
+#include "util_custatevec.h"
+#include "util_custatevecex.h"
+
+namespace qsim {
+
+namespace detail {
+
+inline void free(void* ptr) {}
+
+}  // namespace detail
+
+// Routines for vector manipulations.
+template <typename Impl, typename FP>
+class VectorSpaceCuStateVecEx {
+ public:
+  using fp_type = FP;
+
+  static constexpr auto is_float = std::is_same<fp_type, float>::value;
+  static constexpr auto kStateDataType = is_float ? CUDA_C_32F : CUDA_C_64F;
+
+  enum DistributionType {
+    kNoDistr,
+    kSingleDevice,
+    kMultiDevice,
+    kMultiProcess,
+  };
+
+  enum DeviceNetworkType {
+    kSwitch = 0,
+    kFullMesh = 1,
+  };
+
+  struct Parameter {
+    unsigned num_devices = 0;
+    DeviceNetworkType device_network_type = kSwitch;
+    unsigned verbosity = 0;
+  };
+
+  class Vector {
+   public:
+    struct CuStateVecResources {
+      int32_t device_id = -1;
+      void* device_ptr = nullptr;
+      cudaStream_t stream = nullptr;
+      custatevecHandle_t custatevec_handle = nullptr;
+    };
+
+    Vector(const Vector&) = delete;
+    Vector& operator=(const Vector&) = delete;
+
+    Vector() : mp_(nullptr), ptr_(nullptr),
+        num_qubits_(0), num_substates_(0), distr_type_(kNoDistr) {}
+
+    Vector(const MultiProcessCuStateVecEx* mp,
+           custatevecExStateVectorDescriptor_t ptr, unsigned num_qubits,
+           unsigned num_substates, DistributionType distr_type)
+        : mp_(mp), ptr_(ptr), wire_ordering_(num_qubits),
+          num_qubits_(num_qubits), num_substates_(num_substates),
+          distr_type_(distr_type) {}
+
+    Vector(Vector&& r) : mp_(r.mp_), ptr_(r.ptr_),
+        wire_ordering_(std::move(r.wire_ordering_)),
+        num_qubits_(r.num_qubits_), num_substates_(r.num_substates_),
+        distr_type_(r.distr_type_) {
+      r.mp_ = nullptr;
+      r.ptr_ = nullptr;
+      r.num_qubits_ = 0;
+      r.num_substates_ = 0;
+      r.distr_type_ = kNoDistr;
+    }
+
+    ~Vector() {
+      if (ptr_ != nullptr) {
+        ErrorCheck(custatevecExStateVectorDestroy(ptr_));
+      }
+    }
+
+    Vector& operator=(Vector&& r) {
+      if (this != &r) {
+        // Destroy the descriptor owned so far; overwriting it would leak it.
+        if (ptr_ != nullptr) ErrorCheck(custatevecExStateVectorDestroy(ptr_));
+        mp_ = r.mp_;
+        ptr_ = r.ptr_;
+        wire_ordering_ = std::move(r.wire_ordering_);
+        num_qubits_ = r.num_qubits_;
+        num_substates_ = r.num_substates_;
+        distr_type_ = r.distr_type_;
+        r.mp_ = nullptr;
+        r.ptr_ = nullptr;
+        r.num_qubits_ = 0;
+        r.num_substates_ = 0;
+        r.distr_type_ = kNoDistr;
+      }
+      return *this;
+    }
+
+    auto get() {
+      return ptr_;
+    }
+
+    const auto get() const {
+      return ptr_;
+    }
+
+    custatevecExStateVectorDescriptor_t release() {
+      auto ptr = ptr_;
+
+      mp_ = nullptr;
+      ptr_ = nullptr;
+      num_qubits_ = 0;
+      num_substates_ = 0;
+      distr_type_ = kNoDistr;
+
+      return ptr;
+    }
+
+    unsigned num_qubits() const {
+      return num_qubits_;
+    }
+
+    unsigned num_substates() const {
+      return num_substates_;
+    }
+
+    DistributionType distr_type() const {
+      return distr_type_;
+    }
+
+    static constexpr bool requires_copy_to_host() {
+      return true;
+    }
+
+    const auto& get_wire_ordering() const {
+      ErrorCheck(custatevecExStateVectorGetProperty(
+          ptr_, CUSTATEVEC_EX_SV_PROP_WIRE_ORDERING,
+          wire_ordering_.data(), sizeof(int32_t) * num_qubits_));
+
+      return wire_ordering_;
+    }
+
+    void to_normal_order() const {
+      const auto& wire_ordering = get_wire_ordering();
+
+      ErrorCheck(custatevecExStateVectorPermuteIndexBits(
+          ptr_, wire_ordering.data(), num_qubits_,
+          CUSTATEVEC_EX_PERMUTATION_SCATTER));
+    }
+
+    CuStateVecResources get_resources(unsigned substate_index) const {
+      CuStateVecResources r;
+
+      ErrorCheck(custatevecExStateVectorGetResourcesFromDeviceSubSV(
+          ptr_, substate_index, &r.device_id, &r.device_ptr, &r.stream,
+          &r.custatevec_handle));
+
+      return r;
+    }
+
+    template <typename Callback>
+    void assign(Callback&& callback) const {
+      if (distr_type_ == kMultiProcess) {
+        unsigned num_devices = 1;
+        std::vector<int32_t> substate_indices(num_devices);
+
+        ErrorCheck(custatevecExStateVectorGetProperty(
+            ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES,
+            substate_indices.data(), num_devices * sizeof(int32_t)));
+
+        unsigned k = substate_indices[0];
+        auto res = get_resources(k);
+
+        ErrorCheck(cudaSetDevice(res.device_id));
+
+        callback(k, res);
+      } else {
+        if (num_substates_ == 1) {
+          callback(0, get_resources(0));
+        } else {
+          std::vector<int32_t> substate_indices(num_substates_);
+          ErrorCheck(custatevecExStateVectorGetProperty(
+              ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES,
+              substate_indices.data(), num_substates_ * sizeof(int32_t)));
+
+          for (unsigned i = 0; i < num_substates_; ++i) {
+            unsigned k = substate_indices[i];
+            auto res = get_resources(k);
+
+            ErrorCheck(cudaSetDevice(res.device_id));
+
+            callback(k, res);
+          }
+        }
+      }
+    }
+
+    template <typename Callback>
+    auto reduce(Callback&& callback) const {
+      using ResultType = std::invoke_result_t<Callback, unsigned,
+                                              CuStateVecResources>;
+
+      if (distr_type_ == kMultiProcess) {
+        unsigned num_devices = 1;
+        std::vector<int32_t> substate_indices(num_devices);
+
+        ErrorCheck(custatevecExStateVectorGetProperty(
+            ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES,
+            substate_indices.data(), num_devices * sizeof(int32_t)));
+
+        unsigned k = substate_indices[0];
+        auto res = get_resources(k);
+
+        ErrorCheck(cudaSetDevice(res.device_id));
+
+        ResultType r;
+        ResultType local_r = callback(k, res);
+
+        auto cuda_type = GetCudaType<ResultType>();
+        auto comm = mp_->communicator();
+        ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));
+
+        return r;
+      } else {
+        if (num_substates_ == 1) {
+          return callback(0, get_resources(0));
+        } else {
+          std::vector<int32_t> substate_indices(num_substates_);
+          ErrorCheck(custatevecExStateVectorGetProperty(
+              ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES,
+              substate_indices.data(), num_substates_ * sizeof(int32_t)));
+
+          ResultType r = 0;
+
+          for (unsigned i = 0; i < num_substates_; ++i) {
+            unsigned k = substate_indices[i];
+            auto res = get_resources(k);
+
+            ErrorCheck(cudaSetDevice(res.device_id));
+
+            r += callback(k, res);
+          }
+
+          return r;
+        }
+      }
+    }
+
+    template <typename Callback>
+    void assign(const Vector& vec, Callback&& callback) const {
+      if (distr_type_ == kMultiProcess) {
+        unsigned num_devices = 1;
+        std::vector<int32_t> substate_indices(num_devices);
+
+        ErrorCheck(custatevecExStateVectorGetProperty(
+            ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES,
+            substate_indices.data(), num_devices * sizeof(int32_t)));
+
+        unsigned k = substate_indices[0];
+        auto res1 = get_resources(k);
+        auto res2 = vec.get_resources(k);
+
+        ErrorCheck(cudaSetDevice(res1.device_id));
+
+        callback(k, res1, res2);
+      } else {
+        if (num_substates_ == 1) {
+          callback(0, get_resources(0), vec.get_resources(0));
+        } else {
+          std::vector<int32_t> substate_indices(num_substates_);
+          ErrorCheck(custatevecExStateVectorGetProperty(
+              ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES,
+              substate_indices.data(), num_substates_ * sizeof(int32_t)));
+
+          for (unsigned i = 0; i < num_substates_; ++i) {
+            unsigned k = substate_indices[i];
+            auto res1 = get_resources(k);
+            auto res2 = vec.get_resources(k);
+
+            ErrorCheck(cudaSetDevice(res1.device_id));
+
+            callback(k, res1, res2);
+          }
+        }
+      }
+    }
+
+    template <typename Callback>
+    auto reduce(const Vector& vec, Callback&& callback) const {
+      using ResultType = std::invoke_result_t<Callback, unsigned,
+                                              CuStateVecResources,
+                                              CuStateVecResources>;
+
+      if (distr_type_ == kMultiProcess) {
+        unsigned num_devices = 1;
+        std::vector<int32_t> substate_indices(num_devices);
+
+        ErrorCheck(custatevecExStateVectorGetProperty(
+            ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES,
+            substate_indices.data(), num_devices * sizeof(int32_t)));
+
+        unsigned k = substate_indices[0];
+        auto res1 = get_resources(k);
+        auto res2 = vec.get_resources(k);
+
+        ErrorCheck(cudaSetDevice(res2.device_id));
+        ErrorCheck(cudaStreamSynchronize(res2.stream));
+
+        ResultType r;
+        ResultType local_r = callback(k, res1, res2);
+
+        auto cuda_type = GetCudaType<ResultType>();
+        auto comm = mp_->communicator();
+        ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));
+
+        return r;
+      } else {
+        if (num_substates_ == 1) {
+          return callback(0, get_resources(0), vec.get_resources(0));
+        } else {
+          std::vector<int32_t> substate_indices(num_substates_);
+          ErrorCheck(custatevecExStateVectorGetProperty(
+              ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES,
+              substate_indices.data(), num_substates_ * sizeof(int32_t)));
+
+          ResultType r = 0;
+
+          for (unsigned i = 0; i < num_substates_; ++i) {
+            unsigned k = substate_indices[i];
+            auto res1 = get_resources(k);
+            auto res2 = vec.get_resources(k);
+
+            ErrorCheck(cudaSetDevice(res2.device_id));
+            ErrorCheck(cudaStreamSynchronize(res2.stream));
+
+            r += callback(k, res1, res2);
+          }
+
+          return r;
+        }
+      }
+    }
+
+   private:
+    const MultiProcessCuStateVecEx* mp_;
+    custatevecExStateVectorDescriptor_t ptr_;
+    mutable std::vector<int32_t> wire_ordering_;
+    unsigned num_qubits_;
+    unsigned num_substates_;
+    DistributionType distr_type_;
+  };
+
+  VectorSpaceCuStateVecEx(const Parameter& param,
+                          const MultiProcessCuStateVecEx& mp)
+      : param(param), mp(mp) {}
+
+  // Creates a state vector, selecting the distribution mode automatically.
+  Vector Create(unsigned num_qubits) const {
+    custatevecExStateVectorDescriptor_t state_vec;
+    custatevecExDictionaryDescriptor_t sv_config
+        = mp.create_sv_config(num_qubits, kStateDataType);
+    unsigned num_substates = 1;
+    DistributionType distr_type = kNoDistr;
+
+    if (sv_config != nullptr) {
+      ErrorCheck(custatevecExStateVectorCreateMultiProcess(
+          &state_vec, sv_config, nullptr, mp.communicator(), nullptr));
+
+      num_substates = mp.num_processes();
+      distr_type = kMultiProcess;
+
+      if (param.verbosity > 2) {
+        unsigned num_global_qubits = get_num_global_qubits(num_substates);
+        IO::messagef("multi-process mode: %u %u.\n",
+                     num_qubits, num_global_qubits);
+      }
+    } else {
+      num_substates = param.num_devices;
+
+      if (num_qubits < 3) {
+        num_substates = 1;
+      } else if (num_substates == 0) {
+        int count = 0;
+        ErrorCheck(cudaGetDeviceCount(&count));
+        num_substates = count;
+      }
+
+      if (num_substates == 1) {
+        ErrorCheck(custatevecExConfigureStateVectorSingleDevice(
+            &sv_config, kStateDataType, num_qubits, num_qubits, -1, 0));
+
+        distr_type = kSingleDevice;
+
+        if (param.verbosity > 2) {
+          IO::messagef("single device mode.\n");
+        }
+      } else {
+        unsigned num_global_qubits = get_num_global_qubits(num_substates);
+        // Global qubits address 2^num_global_qubits substates; drop the rest.
+        num_substates = 1u << num_global_qubits;
+        while (num_global_qubits + 2 > num_qubits && num_substates > 1) {
+          num_substates /= 2;
+          --num_global_qubits;
+        }
+
+        if (num_substates == 1) {
+          ErrorCheck(custatevecExConfigureStateVectorSingleDevice(
+              &sv_config, kStateDataType, num_qubits, num_qubits, -1, 0));
+
+          distr_type = kSingleDevice;
+
+          if (param.verbosity > 2) {
+            IO::messagef("single-device mode (too few qubits).\n");
+          }
+        } else {
+          std::vector<int32_t> device_ids(num_substates);
+          for (unsigned i = 0; i < num_substates; ++i) {
+            device_ids[i] = i;
+          }
+
+          unsigned num_local_qubits = num_qubits - num_global_qubits;
+          auto device_network_type =
+            get_device_network_type(param.device_network_type);
+
+          ErrorCheck(custatevecExConfigureStateVectorMultiDevice(
+              &sv_config, kStateDataType, num_qubits, num_local_qubits,
+              device_ids.data(), num_substates, device_network_type, 0));
+
+          distr_type = kMultiDevice;
+
+          if (param.verbosity > 2) {
+            IO::messagef("multi-device mode: %u %u.\n",
+                         num_qubits, num_global_qubits);
+          }
+        }
+      }
+
+      ErrorCheck(custatevecExStateVectorCreateSingleProcess(
+        &state_vec, sv_config, nullptr, 0, nullptr));
+    }
+
+    ErrorCheck(custatevecExDictionaryDestroy(sv_config));
+
+    return Vector{&mp, state_vec, num_qubits, num_substates, distr_type};
+  }
+
+  static Vector Null() {
+    return Vector{nullptr, nullptr, 0, 0, kNoDistr};
+  }
+
+  static bool IsNull(const Vector& vector) {
+    return vector.get() == nullptr;
+  }
+
+  // Copies src into dest substate by substate (device to device). Returns
+  // false if the qubit or substate counts differ or src has no substates.
+  bool Copy(const Vector& src, Vector& dest) const {
+    if (src.num_qubits() != dest.num_qubits() || src.num_substates() == 0 ||
+        src.num_substates() != dest.num_substates()) {
+      return false;
+    }
+    // Number of amplitudes held by each substate.
+    uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates();
+    auto f = [&size](unsigned i, const auto& rd, const auto& rs) {
+      ErrorCheck(cudaMemcpy(
+          rd.device_ptr, rs.device_ptr, 2 * sizeof(fp_type) * size,
+          cudaMemcpyDeviceToDevice));
+    };
+    dest.assign(src, f);
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that dest has at least
+  // 2^src.num_qubits() elements.
+  bool Copy(const Vector& src, fp_type* dest) const {
+    if (src.distr_type() == kMultiProcess) {
+      uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates();
+      uint64_t offset = size * mp.rank();
+
+      ErrorCheck(custatevecExStateVectorGetState(
+          src.get(), dest + 2 * offset, kStateDataType,
+          offset, offset + size, 1));
+      ErrorCheck(custatevecExStateVectorSynchronize(src.get()));
+
+      auto cuda_type = GetCudaType<std::complex<fp_type>>();
+      auto comm = mp.communicator();
+      ErrorCheck(comm->intf->allgather(
+          comm, dest + 2 * offset, dest, size, cuda_type));
+    } else {
+      uint64_t size = uint64_t{1} << src.num_qubits();
+      ErrorCheck(custatevecExStateVectorGetState(
+          src.get(), dest, kStateDataType, 0, size, 1));
+      ErrorCheck(custatevecExStateVectorSynchronize(src.get()));
+    }
+
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that src has at least
+  // 2^dest.num_qubits() elements.
+  bool Copy(const fp_type* src, Vector& dest) const {
+    if (dest.distr_type() == kMultiProcess) {
+      uint64_t size = (uint64_t{1} << dest.num_qubits()) / dest.num_substates();
+      uint64_t offset = size * mp.rank();
+
+      ErrorCheck(custatevecExStateVectorSetState(
+          dest.get(), src + 2 * offset, kStateDataType,
+          offset, offset + size, 1));
+    } else {
+      uint64_t size = uint64_t{1} << dest.num_qubits();
+      ErrorCheck(custatevecExStateVectorSetState(
+          dest.get(), src, kStateDataType, 0, size, 1));
+    }
+
+    ErrorCheck(custatevecExStateVectorSynchronize(dest.get()));
+
+    // TODO: do we need that?
+    dest.to_normal_order();
+
+    return true;
+  }
+
+  // Copies size fp_type values (real/imaginary pairs) from src into dest.
+  // size must be exactly 2 * 2^dest.num_qubits(); otherwise false is returned.
+  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
+    // Compare against the full count so that an odd size is rejected too.
+    if (size != (uint64_t{2} << dest.num_qubits())) {
+      IO::errorf("wrong size in VectorSpaceCuStateVecEx::Copy.\n");
+      return false;
+    }
+
+    size /= 2;  // From here on, size counts complex amplitudes.
+    if (dest.distr_type() == kMultiProcess) {
+      size /= dest.num_substates();
+      uint64_t offset = size * mp.rank();
+
+      ErrorCheck(custatevecExStateVectorSetState(
+          dest.get(), src + 2 * offset, kStateDataType,
+          offset, offset + size, 1));
+    } else {
+      ErrorCheck(custatevecExStateVectorSetState(
+          dest.get(), src, kStateDataType, 0, size, 1));
+    }
+
+    ErrorCheck(custatevecExStateVectorSynchronize(dest.get()));
+
+    // TODO: do we need that?
+    dest.to_normal_order();
+
+    return true;
+  }
+
+  static void DeviceSync() {
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+ protected:
+  Parameter param;
+  const MultiProcessCuStateVecEx& mp;
+
+ private:
+  // Translates the qsim DeviceNetworkType option into the corresponding
+  // custatevecDeviceNetworkType_t value.
+  static custatevecDeviceNetworkType_t get_device_network_type(
+      DeviceNetworkType id) {
+    switch (id) {
+    case kFullMesh:
+      return CUSTATEVEC_DEVICE_NETWORK_TYPE_FULLMESH;
+
+    case kSwitch:
+      return CUSTATEVEC_DEVICE_NETWORK_TYPE_SWITCH;
+    }
+
+    // Values outside the enum fall back to the switch topology, which is
+    // also what kSwitch selects.
+    return CUSTATEVEC_DEVICE_NETWORK_TYPE_SWITCH;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // VECTORSPACE_CUSTATEVECEX_H_
diff --git a/pybind_interface/Makefile b/pybind_interface/Makefile
index 9bcc54b63..f9693c82c 100644
--- a/pybind_interface/Makefile
+++ b/pybind_interface/Makefile
@@ -21,6 +21,7 @@ QSIMLIB_AVX2 = ../qsimcirq/qsim_avx2$(SUFFIX)
 QSIMLIB_AVX512 = ../qsimcirq/qsim_avx512$(SUFFIX)
 QSIMLIB_CUDA = ../qsimcirq/qsim_cuda$(SUFFIX)
 QSIMLIB_CUSTATEVEC = ../qsimcirq/qsim_custatevec$(SUFFIX)
+QSIMLIB_CUSTATEVECEX = ../qsimcirq/qsim_custatevecex$(SUFFIX)
 QSIMLIB_HIP = ../qsimcirq/qsim_hip$(SUFFIX)
 QSIMLIB_DECIDE = ../qsimcirq/qsim_decide$(SUFFIX)
 
@@ -66,7 +67,7 @@ else
 ifeq ($(CUQUANTUM_ROOT),)
 pybind: pybind-cpu pybind-cuda decide-cuda
 else
-pybind: pybind-cpu pybind-cuda pybind-custatevec decide-custatevec
+pybind: pybind-cpu pybind-cuda pybind-custatevec pybind-custatevecex decide-custatevec
 endif
 endif
 
@@ -94,9 +95,13 @@ decide-cuda:
 pybind-custatevec:
 	$(NVCC) custatevec/pybind_main_custatevec.cpp -o $(QSIMLIB_CUSTATEVEC) $(NVCCFLAGS) $(PYBINDFLAGS_CUSTATEVEC)
 
+.PHONY: pybind-custatevecex
+pybind-custatevecex:
+	$(NVCC) custatevecex/pybind_main_custatevecex.cpp -o $(QSIMLIB_CUSTATEVECEX) $(NVCCFLAGS) $(PYBINDFLAGS_CUSTATEVEC)
+
 .PHONY: decide-custatevec
 decide-custatevec:
-	$(NVCC) decide/decide.cpp -D__CUSTATEVEC__ -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA)
+	$(NVCC) decide/decide.cpp -D__CUSTATEVEC__ -D__CUSTATEVECEX__ -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA)
 
 .PHONY: pybind-hip
 pybind-hip:
@@ -119,4 +124,5 @@ clean:
 	-rm -f ./cuda/*.x ./cuda/*.a ./cuda/*.so ./cuda/*.mod $(QSIMLIB_CUDA)
 	-rm -f ./hip/*.x ./hip/*.a ./hip/*.so ./hip/*.mod $(QSIMLIB_HIP)
 	-rm -f ./custatevec/*.x ./custatevec/*.a ./custatevec/*.so ./custatevec/*.mod $(QSIMLIB_CUSTATEVEC)
+	-rm -f ./custatevecex/*.x ./custatevecex/*.a ./custatevecex/*.so ./custatevecex/*.mod $(QSIMLIB_CUSTATEVECEX)
 	-rm -f ./decide/*.x ./decide/*.a ./decide/*.so ./decide/*.mod $(QSIMLIB_DECIDE)
diff --git a/pybind_interface/cuda/CMakeLists.txt b/pybind_interface/cuda/CMakeLists.txt
index 6ef6be3cf..39d0b8bba 100644
--- a/pybind_interface/cuda/CMakeLists.txt
+++ b/pybind_interface/cuda/CMakeLists.txt
@@ -22,7 +22,7 @@ if(WIN32)
     # This prevents a conflict with /RTC1 in DEBUG builds.
     add_compile_options($<$<NOT:$<CONFIG:Debug>>:/O2>)
 else()
-    add_compile_options(-O3 -flto=auto)
+    add_compile_options(-O3 -fno-lto)
 endif()
 
 if(APPLE)
diff --git a/pybind_interface/custatevec/CMakeLists.txt b/pybind_interface/custatevec/CMakeLists.txt
index 2bdd34c12..e4a5f808c 100644
--- a/pybind_interface/custatevec/CMakeLists.txt
+++ b/pybind_interface/custatevec/CMakeLists.txt
@@ -21,7 +21,7 @@ if(WIN32)
     # This prevents a conflict with /RTC1 in DEBUG builds.
     add_compile_options($<$<NOT:$<CONFIG:Debug>>:/O2>)
 else()
-    add_compile_options(-O3 -flto=auto)
+    add_compile_options(-O3 -fno-lto)
 endif()
 
 if(APPLE)
diff --git a/pybind_interface/custatevecex/CMakeLists.txt b/pybind_interface/custatevecex/CMakeLists.txt
new file mode 100644
index 000000000..f25216af4
--- /dev/null
+++ b/pybind_interface/custatevecex/CMakeLists.txt
@@ -0,0 +1,59 @@
+# Copyright 2025 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.28)
+project(qsim LANGUAGES CXX CUDA)
+
+if(WIN32)
+    add_compile_options(/openmp)
+    # Add /O2 to any configuration that is NOT Debug.
+    # This prevents a conflict with /RTC1 in DEBUG builds.
+    add_compile_options($<$<NOT:$<CONFIG:Debug>>:/O2>)
+else()
+    add_compile_options(-O3 -fno-lto)
+endif()
+
+if(APPLE)
+    include_directories(
+      "/usr/local/include"
+      "/usr/local/opt/llvm/include"
+      "/opt/homebrew/include"
+      "/opt/homebrew/opt/llvm@19/include"
+    )
+    link_directories(
+      "/usr/local/lib"
+      "/usr/local/opt/llvm/lib"
+      "/opt/homebrew/lib"
+      "/opt/homebrew/opt/llvm@19/lib"
+    )
+endif()
+
+include(../GetPybind11.cmake)
+find_package(Python3 3.10 REQUIRED)
+
+include_directories(${pybind11_INCLUDE_DIRS})
+
+include_directories($ENV{CUQUANTUM_ROOT}/include)
+link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64)
+
+add_library(qsim_custatevecex MODULE pybind_main_custatevecex.cpp)
+target_link_libraries(qsim_custatevecex -lcustatevec -lcublas)
+
+set_target_properties(qsim_custatevecex PROPERTIES
+    PREFIX "${PYTHON_MODULE_PREFIX}"
+    SUFFIX "${PYTHON_MODULE_EXTENSION}"
+)
+set_source_files_properties(pybind_main_custatevecex.cpp PROPERTIES LANGUAGE CUDA)
+
+target_link_libraries(qsim_custatevecex OpenMP::OpenMP_CXX)
diff --git a/pybind_interface/custatevecex/pybind_main_custatevecex.cpp b/pybind_interface/custatevecex/pybind_main_custatevecex.cpp
new file mode 100644
index 000000000..c29a608a6
--- /dev/null
+++ b/pybind_interface/custatevecex/pybind_main_custatevecex.cpp
@@ -0,0 +1,74 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <custatevecEx.h>
+
+#include "pybind_main_custatevecex.h"
+
+#include "../../lib/fuser_mqubit.h"
+#include "../../lib/gates_cirq.h"
+#include "../../lib/io.h"
+#include "../../lib/multiprocess_custatevecex.h"
+#include "../../lib/run_custatevecex.h"
+#include "../../lib/simulator_custatevecex.h"
+
+namespace {
+
+qsim::MultiProcessCuStateVecEx mp;
+
+}  // namespace
+
+namespace qsim {
+  using Simulator = SimulatorCuStateVecEx<float>;
+
+  struct Factory {
+    // num_sim_threads, num_state_threads and num_dblocks are unused, but kept
+    // for consistency with other factories.
+    Factory(unsigned num_sim_threads,
+            unsigned num_state_threads,
+            unsigned num_dblocks) {
+      if (!mp.initialized()) {
+        mp.initialize();
+      }
+    }
+
+    using Simulator = qsim::Simulator;
+    using StateSpace = Simulator::StateSpace;
+
+    using Gate = Cirq::GateCirq<float>;
+    using Runner = CuStateVecExRunner<IO, Factory>;
+    struct RunnerParameter : public Runner::Parameter {
+      // max_fused_size is not used, but kept for consistency.
+      unsigned max_fused_size = 2;
+    };
+    using NoisyRunner = qsim::QuantumTrajectorySimulator<IO, Gate, Runner>;
+    struct NoisyRunnerParameter : public NoisyRunner::Parameter {
+      // max_fused_size is not used, but kept for consistency.
+      unsigned max_fused_size = 2;
+    };
+
+    StateSpace CreateStateSpace() const {
+      return StateSpace{mp};
+    }
+
+    Simulator CreateSimulator() const {
+      return Simulator{};
+    }
+  };
+
+  inline void SetFlushToZeroAndDenormalsAreZeros() {}
+  inline void ClearFlushToZeroAndDenormalsAreZeros() {}
+}
+
+#include "../pybind_main.cpp"
diff --git a/pybind_interface/custatevecex/pybind_main_custatevecex.h b/pybind_interface/custatevecex/pybind_main_custatevecex.h
new file mode 100644
index 000000000..06290dda5
--- /dev/null
+++ b/pybind_interface/custatevecex/pybind_main_custatevecex.h
@@ -0,0 +1,17 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "../pybind_main.h"
+
+PYBIND11_MODULE(qsim_custatevecex, m) { GPU_MODULE_BINDINGS }
diff --git a/pybind_interface/decide/CMakeLists.txt b/pybind_interface/decide/CMakeLists.txt
index 0c8b8d941..b48e0e587 100644
--- a/pybind_interface/decide/CMakeLists.txt
+++ b/pybind_interface/decide/CMakeLists.txt
@@ -24,7 +24,7 @@ if(WIN32)
     # This prevents a conflict with /RTC1 in DEBUG builds.
     add_compile_options($<$<NOT:$<CONFIG:Debug>>:/O2>)
 else()
-    add_compile_options(-O3 -flto=auto)
+    add_compile_options(-O3 -fno-lto)
 endif()
 
 if(APPLE)
@@ -52,6 +52,9 @@ if(CMAKE_CUDA_COMPILER)
         target_compile_options(qsim_decide PRIVATE
             $<$<COMPILE_LANGUAGE:CUDA>:-D__CUSTATEVEC__>
         )
+        target_compile_options(qsim_decide PRIVATE
+            $<$<COMPILE_LANGUAGE:CUDA>:-D__CUSTATEVECEX__>
+        )
     endif()
     find_package(Python3 3.10 REQUIRED COMPONENTS Interpreter Development)
     include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include)
diff --git a/pybind_interface/decide/decide.cpp b/pybind_interface/decide/decide.cpp
index b40f6975e..51cd52425 100644
--- a/pybind_interface/decide/decide.cpp
+++ b/pybind_interface/decide/decide.cpp
@@ -61,7 +61,8 @@ int detect_instructions() {
 }
 
 enum GPUCapabilities {
-    CUDA = 0, CUSTATEVEC = 1, HIP = 2, NO_GPU = 10, NO_CUSTATEVEC = 11 };
+    CUDA = 0, CUSTATEVEC = 1, CUSTATEVECEX = 2, HIP = 3, NO_GPU = 10,
+    NO_CUSTATEVEC = 11, NO_CUSTATEVECEX = 12 };
 
 // For now, GPU detection is performed at compile time, as our wheels are
 // generated on Github Actions runners which do not have GPU support.
@@ -93,6 +94,20 @@ int detect_custatevec() {
   return gpu;
 }
 
+// For now, cuStateVecEx detection is performed at compile time, as our wheels
+// are generated on Github Actions runners which do not have GPU support.
+//
+// Users wishing to use qsim with cuStateVecEx will need to compile locally on
+// a device which has the necessary CUDA toolkit and cuStateVecEx library.
+int detect_custatevecex() {
+  // Support is reported only when built by NVCC with __CUSTATEVECEX__ set.
+  #if defined(__NVCC__) && defined(__CUSTATEVECEX__)
+  return CUSTATEVECEX;
+  #else
+  return NO_CUSTATEVECEX;
+  #endif
+}
+
 PYBIND11_MODULE(qsim_decide, m) {
   m.doc() = "pybind11 plugin";  // optional module docstring
 
@@ -104,4 +119,7 @@ PYBIND11_MODULE(qsim_decide, m) {
 
   // Detect cuStateVec.
   m.def("detect_custatevec", &detect_custatevec, "Detect cuStateVec");
+
+  // Detect cuStateVecEx.
+  m.def("detect_custatevecex", &detect_custatevecex, "Detect cuStateVecEx");
 }
diff --git a/pybind_interface/hip/CMakeLists.txt b/pybind_interface/hip/CMakeLists.txt
index 56f0cd0e6..4cad3230c 100644
--- a/pybind_interface/hip/CMakeLists.txt
+++ b/pybind_interface/hip/CMakeLists.txt
@@ -21,7 +21,7 @@ if(WIN32)
     # This prevents a conflict with /RTC1 in DEBUG builds.
     add_compile_options($<$<NOT:$<CONFIG:Debug>>:/O2>)
 else()
-    add_compile_options(-O3 -flto=auto)
+    add_compile_options(-O3 -fno-lto)
 endif()
 
 include(../GetPybind11.cmake)
diff --git a/qsimcirq/__init__.py b/qsimcirq/__init__.py
index 7de9cc506..4a91936c6 100644
--- a/qsimcirq/__init__.py
+++ b/qsimcirq/__init__.py
@@ -35,7 +35,7 @@ def _load_qsim_gpu():
     instr = qsim_decide.detect_gpu()
     if instr == 0:
         qsim_gpu = importlib.import_module("qsimcirq.qsim_cuda")
-    elif instr == 2:
+    elif instr == 3:
         qsim_gpu = importlib.import_module("qsimcirq.qsim_hip")
     else:
         qsim_gpu = None
@@ -51,9 +51,19 @@ def _load_qsim_custatevec():
     return qsim_custatevec
 
 
+def _load_qsim_custatevecex():
+    """Return the cuStateVecEx module, or None if it was not built."""
+    # 2 is the CUSTATEVECEX value of GPUCapabilities in decide.cpp.
+    custatevecex_available = qsim_decide.detect_custatevecex() == 2
+    if not custatevecex_available:
+        return None
+    return importlib.import_module("qsimcirq.qsim_custatevecex")
+
+
 qsim = _load_simd_qsim()
 qsim_gpu = _load_qsim_gpu()
 qsim_custatevec = _load_qsim_custatevec()
+qsim_custatevecex = _load_qsim_custatevecex()
 
 # Note: the following imports must remain at the bottom of this file.
 
diff --git a/qsimcirq/qsim_simulator.py b/qsimcirq/qsim_simulator.py
index b3c0106fc..240715639 100644
--- a/qsimcirq/qsim_simulator.py
+++ b/qsimcirq/qsim_simulator.py
@@ -21,7 +21,7 @@
 
 import qsimcirq.qsim_circuit as qsimc
 
-from . import qsim, qsim_custatevec, qsim_gpu
+from . import qsim, qsim_custatevec, qsim_custatevecex, qsim_gpu
 
 
 # This should probably live in Cirq...
@@ -60,9 +60,10 @@ class QSimOptions:
             simulation modes.
         use_gpu: whether to use GPU instead of CPU for simulation. The "gpu_*"
             arguments below are only considered if this is set to True.
-        gpu_mode: use CUDA if set to 0 (default value) or use the NVIDIA
-            cuStateVec library if set to any other value. The "gpu_*"
-            arguments below are only considered if this is set to 0.
+        gpu_mode: use CUDA if set to 0 (default value), use the NVIDIA
+            cuStateVec library if set to 1, or use the NVIDIA cuStateVecEx
+            library if set to any other value. The "gpu_*" arguments below are
+            only considered if this is set to 0.
         gpu_state_threads: number of threads per CUDA block to use for the GPU
             StateSpace. This must be a power of 2 in the range [32, 1024].
         gpu_data_blocks: number of data blocks to use for the GPU StateSpace.
@@ -180,16 +181,26 @@ def __init__(
                     )
                 else:
                     self._sim_module = qsim_gpu
-            else:
+            elif self.qsim_options["gmode"] == 1:
                 if qsim_custatevec is None:
                     raise ValueError(
                         "cuStateVec GPU execution requested, but not "
                         "supported. If your device has GPU support and the "
-                        "NVIDIA cuStateVec library is installed, you may need "
-                        "to compile qsim locally."
+                        "NVIDIA cuStateVec library is installed, you may "
+                        "need to compile qsim locally."
                     )
                 else:
                     self._sim_module = qsim_custatevec
+            else:
+                if qsim_custatevecex is None:
+                    raise ValueError(
+                        "cuStateVecEx GPU execution requested, but not "
+                        "supported. If your device has GPU support and the "
+                        "NVIDIA cuStateVecEx library is installed, you may "
+                        "need to compile qsim locally."
+                    )
+                else:
+                    self._sim_module = qsim_custatevecex
         else:
             self._sim_module = qsim
 
diff --git a/qsimcirq_tests/qsimcirq_test.py b/qsimcirq_tests/qsimcirq_test.py
index 37573b9c7..0b5cdbc8e 100644
--- a/qsimcirq_tests/qsimcirq_test.py
+++ b/qsimcirq_tests/qsimcirq_test.py
@@ -1533,6 +1533,115 @@ def test_qsim_custatevec_input_state():
             assert cirq.approx_eq(state_vector[i], 0, atol=1e-6)
 
 
+def test_cirq_qsim_custatevecex_amplitudes():
+    if qsimcirq.qsim_custatevecex is None:
+        pytest.skip("cuStateVecEx library is not available for testing.")
+    # Pick qubits.
+    a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)]
+
+    # Create a circuit
+    cirq_circuit = cirq.Circuit(cirq.CNOT(a, b), cirq.CNOT(b, a), cirq.X(a))
+
+    # Enable GPU acceleration.
+    custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2)
+    qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options)
+    result = qsimGpuSim.compute_amplitudes(
+        cirq_circuit, bitstrings=[0b00, 0b01, 0b10, 0b11]
+    )
+    assert np.allclose(result, [0j, 0j, (1 + 0j), 0j])
+
+
+def test_cirq_qsim_custatevecex_simulate():
+    if qsimcirq.qsim_custatevecex is None:
+        pytest.skip("cuStateVecEx library is not available for testing.")
+    # Pick qubits.
+    a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)]
+
+    # Create a circuit
+    cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b))
+
+    # Enable GPU acceleration.
+    custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2)
+    qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options)
+    result = qsimGpuSim.simulate(cirq_circuit)
+    assert result.state_vector().shape == (4,)
+
+    cirqSim = cirq.Simulator()
+    cirq_result = cirqSim.simulate(cirq_circuit)
+    assert cirq.linalg.allclose_up_to_global_phase(
+        result.state_vector(), cirq_result.state_vector(), atol=1.0e-6
+    )
+
+
+def test_cirq_qsim_custatevecex_expectation_values():
+    if qsimcirq.qsim_custatevecex is None:
+        pytest.skip("cuStateVecEx library is not available for testing.")
+    # Pick qubits.
+    a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)]
+
+    # Create a circuit
+    cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b))
+    obs = [cirq.Z(a) * cirq.Z(b)]
+
+    # Enable GPU acceleration.
+    custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2)
+    qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options)
+    result = qsimGpuSim.simulate_expectation_values(cirq_circuit, obs)
+
+    cirqSim = cirq.Simulator()
+    cirq_result = cirqSim.simulate_expectation_values(cirq_circuit, obs)
+    assert np.allclose(result, cirq_result)
+
+
+def test_cirq_qsim_custatevecex_input_state():
+    if qsimcirq.qsim_custatevecex is None:
+        pytest.skip("cuStateVecEx library is not available for testing.")
+    # Pick qubits.
+    a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)]
+
+    # Create a circuit
+    cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b))
+
+    # Enable GPU acceleration.
+    custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2)
+    qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options)
+    initial_state = np.asarray([0.5] * 4, dtype=np.complex64)
+    result = qsimGpuSim.simulate(cirq_circuit, initial_state=initial_state)
+    assert result.state_vector().shape == (4,)
+
+    cirqSim = cirq.Simulator()
+    cirq_result = cirqSim.simulate(cirq_circuit, initial_state=initial_state)
+    assert cirq.linalg.allclose_up_to_global_phase(
+        result.state_vector(), cirq_result.state_vector(), atol=1.0e-6
+    )
+
+
+def test_qsim_custatevecex_input_state():
+    if qsimcirq.qsim_custatevecex is None:
+        pytest.skip("cuStateVecEx library is not available for testing.")
+
+    for num_qubits in range(1, 8):
+        size = 2**num_qubits
+        qubits = cirq.LineQubit.range(num_qubits)
+        circuit = cirq.Circuit()
+
+        for k in range(num_qubits):
+            circuit.append(cirq.H(qubits[k]))
+
+        # Enable GPU acceleration.
+        custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2)
+        qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options)
+        initial_state = np.asarray([np.sqrt(1.0 / size)] * size, dtype=np.complex64)
+        result = qsimGpuSim.simulate(circuit, initial_state=initial_state)
+        state_vector = result.state_vector()
+
+        assert result.state_vector().shape == (size,)
+        assert cirq.approx_eq(state_vector[0], 1, atol=1e-6)
+
+        for i in range(1, size):
+            assert cirq.approx_eq(state_vector[i], 0, atol=1e-6)
+
+
 def test_cirq_qsim_old_options():
     old_options = {"f": 3, "t": 4, "r": 100, "v": 1}
     old_sim = qsimcirq.QSimSimulator(qsim_options=old_options)
diff --git a/setup.py b/setup.py
index d3ccfa54c..2ac063ee8 100644
--- a/setup.py
+++ b/setup.py
@@ -136,6 +136,7 @@ def build_extension(self, ext):
         CMakeExtension("qsimcirq/qsim_basic"),
         CMakeExtension("qsimcirq/qsim_cuda"),
         CMakeExtension("qsimcirq/qsim_custatevec"),
+        CMakeExtension("qsimcirq/qsim_custatevecex"),
         CMakeExtension("qsimcirq/qsim_decide"),
         CMakeExtension("qsimcirq/qsim_hip"),
     ],
diff --git a/tests/Makefile b/tests/Makefile
index e09b5fc4b..46f35e492 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -47,6 +47,9 @@ CUDA_TARGETS := $(CUDA_FILES:%cuda_test.cu=%cuda_test.x)
 CUSTATEVEC_FILES := $(wildcard *custatevec_test.cu)
 CUSTATEVEC_TARGETS := $(CUSTATEVEC_FILES:%custatevec_test.cu=%custatevec_test.x)
 
+CUSTATEVECEX_FILES := $(wildcard *custatevecex_test.cu)
+CUSTATEVECEX_TARGETS := $(CUSTATEVECEX_FILES:%custatevecex_test.cu=%custatevecex_test.x)
+
 HIP_FILES := $(wildcard *cuda_test.cu)
 HIP_TARGETS := $(HIP_FILES:%cuda_test.cu=%hip_test.x)
 
@@ -66,6 +69,9 @@ cuda-tests: $(CUDA_TARGETS)
 .PHONY: custatevec-tests
 custatevec-tests: $(CUSTATEVEC_TARGETS)
 
+.PHONY: custatevecex-tests
+custatevecex-tests: $(CUSTATEVECEX_TARGETS)
+
 .PHONY: hip-tests
 hip-tests: $(HIP_TARGETS)
 
@@ -81,6 +87,14 @@ run-cuda-tests: | $(GTEST_DIR)/build cuda-tests
 run-custatevec-tests: | $(GTEST_DIR)/build custatevec-tests
 	for exe in $(CUSTATEVEC_TARGETS); do if ! ./$$exe; then exit 1; fi; done
 
+.PHONY: run-custatevecex-tests
+run-custatevecex-tests: | $(GTEST_DIR)/build custatevecex-tests
+	for exe in $(CUSTATEVECEX_TARGETS); do if ! ./$$exe; then exit 1; fi; done
+
+.PHONY: run-custatevecex-mpi-tests
+run-custatevecex-mpi-tests: | $(GTEST_DIR)/build custatevecex-tests
+	for exe in $(CUSTATEVECEX_TARGETS); do if ! mpirun -np 2 ./$$exe; then exit 1; fi; done
+
 .PHONY: run-hip-tests
 run-hip-tests: | $(GTEST_DIR)/build hip-tests
 	for exe in $(HIP_TARGETS); do if ! ./$$exe; then exit 1; fi; done
@@ -100,6 +114,9 @@ $(GTEST_DIR)/build:
 %custatevec_test.x: %custatevec_test.cu $(GTEST_DIR)/build
 	$(NVCC) -o ./$@ $< $(TESTFLAGS) $(NVCCFLAGS) $(CUSTATEVECFLAGS)
 
+%custatevecex_test.x: %custatevecex_test.cu $(GTEST_DIR)/build
+	$(NVCC) -o ./$@ $< $(TESTFLAGS) $(NVCCFLAGS) $(CUSTATEVECFLAGS)
+
 %hip_test.x: %cuda_test.cu $(GTEST_DIR)/build
 	$(HIPCC) -o ./$@ $< $(TESTFLAGS) $(HIPCCFLAGS)
 
diff --git a/tests/hybrid_custatevecex_test.cu b/tests/hybrid_custatevecex_test.cu
new file mode 100644
index 000000000..a0c75b031
--- /dev/null
+++ b/tests/hybrid_custatevecex_test.cu
@@ -0,0 +1,59 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hybrid_testfixture.h"
+
+#include <custatevecEx.h>
+
+#include "gtest/gtest.h"
+
+#include "../lib/multiprocess_custatevecex.h"
+#include "../lib/simulator_custatevecex.h"
+
+namespace qsim {
+
+MultiProcessCuStateVecEx mp;
+
+template <typename FP>
+struct Factory {
+  using fp_type = FP;
+  using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
+  using StateSpace = typename Simulator::StateSpace;
+
+  StateSpace CreateStateSpace() const {
+    return StateSpace{mp};
+  }
+
+  Simulator CreateSimulator() const {
+    return Simulator{};
+  }
+};
+
+TEST(HybridCuStateVecExTest, Hybrid2) {
+  TestHybrid2(qsim::Factory<float>());
+}
+
+TEST(HybridCuStateVecExTest, Hybrid4) {
+  TestHybrid4(qsim::Factory<float>());
+}
+
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  qsim::mp.initialize();
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/qtrajectory_custatevecex_test.cu b/tests/qtrajectory_custatevecex_test.cu
new file mode 100644
index 000000000..8d70bfc00
--- /dev/null
+++ b/tests/qtrajectory_custatevecex_test.cu
@@ -0,0 +1,88 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "qtrajectory_testfixture.h"
+
+#include <custatevecEx.h>
+
+#include "gtest/gtest.h"
+
+#include "../lib/io.h"
+#include "../lib/multiprocess_custatevecex.h"
+#include "../lib/run_custatevecex.h"
+#include "../lib/simulator_custatevecex.h"
+
+namespace qsim {
+
+MultiProcessCuStateVecEx mp;
+
+template <typename FP>
+struct Factory {
+  using fp_type = FP;
+  using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
+  using StateSpace = typename Simulator::StateSpace;
+
+  StateSpace CreateStateSpace() const {
+    return StateSpace{mp};
+  }
+
+  Simulator CreateSimulator() const {
+    return Simulator{};
+  }
+};
+
+TEST(QTrajectoryCuStateVecExTest, BitFlip) {
+  using Runner = CuStateVecExRunner<IO, Factory<float>>;
+  TestBitFlip<Runner>(Factory<float>());
+}
+
+TEST(QTrajectoryCuStateVecExTest, GenDump) {
+  using Runner = CuStateVecExRunner<IO, Factory<float>>;
+  TestGenDump<Runner>(Factory<float>());
+}
+
+TEST(QTrajectoryCuStateVecExTest, ReusingResults) {
+  using Runner = CuStateVecExRunner<IO, Factory<float>>;
+  TestReusingResults<Runner>(Factory<float>());
+}
+
+TEST(QTrajectoryCuStateVecExTest, CollectKopStat) {
+  using Runner = CuStateVecExRunner<IO, Factory<float>>;
+  TestCollectKopStat<Runner>(Factory<float>());
+}
+
+TEST(QTrajectoryCuStateVecExTest, CleanCircuit) {
+  using Runner = CuStateVecExRunner<IO, Factory<float>>;
+  TestCleanCircuit<Runner>(Factory<float>());
+}
+
+TEST(QTrajectoryCuStateVecExTest, InitialState) {
+  using Runner = CuStateVecExRunner<IO, Factory<float>>;
+  TestInitialState<Runner>(Factory<float>());
+}
+
+TEST(QTrajectoryCuStateVecExTest, UncomputeFinalState) {
+  using Runner = CuStateVecExRunner<IO, Factory<float>>;
+  TestUncomputeFinalState<Runner>(Factory<float>());
+}
+
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  qsim::mp.initialize();
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/run_custatevecex_test.cu b/tests/run_custatevecex_test.cu
new file mode 100644
index 000000000..079fd2696
--- /dev/null
+++ b/tests/run_custatevecex_test.cu
@@ -0,0 +1,262 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <sstream>
+
+#include "gates_cirq_testfixture.h"
+
+#include <custatevecEx.h>
+
+#include "gtest/gtest.h"
+
+#include "../lib/circuit_qsim_parser.h"
+#include "../lib/gates_qsim.h"
+#include "../lib/io.h"
+#include "../lib/multiprocess_custatevecex.h"
+#include "../lib/run_custatevecex.h"
+#include "../lib/simulator_custatevecex.h"
+
+namespace qsim {
+
+constexpr char provider[] = "run_custatevecex_test";
+
+constexpr char circuit_string[] =
+R"(4
+0 h 0
+0 h 1
+0 h 2
+0 h 3
+1 cz 0 1
+1 cz 2 3
+2 t 0
+2 x 1
+2 y 2
+2 t 3
+3 y 0
+3 cz 1 2
+3 x 3
+4 t 1
+4 t 2
+5 cz 1 2
+6 x 1
+6 y 2
+7 cz 1 2
+8 t 1
+8 t 2
+9 cz 0 1
+9 cz 2 3
+10 h 0
+10 h 1
+10 h 2
+10 h 3
+)";
+
+MultiProcessCuStateVecEx mp;
+
+struct Factory {
+  using Simulator = qsim::SimulatorCuStateVecEx<float>;
+  using StateSpace = typename Simulator::StateSpace;
+
+  StateSpace CreateStateSpace() const {
+    return StateSpace{mp};
+  }
+
+  Simulator CreateSimulator() const {
+    return Simulator{};
+  }
+};
+
+TEST(RunQSimTest, QSimRunner1) {
+  std::stringstream ss(circuit_string);
+  Circuit<GateQSim<float>> circuit;
+
+  EXPECT_TRUE(CircuitQsimParser<IO>::FromStream(99, provider, ss, circuit));
+  EXPECT_EQ(circuit.num_qubits, 4);
+  EXPECT_EQ(circuit.gates.size(), 27);
+
+  using Simulator = Factory::Simulator;
+  using StateSpace = Simulator::StateSpace;
+  using State = StateSpace::State;
+  using Runner = CuStateVecExRunner<IO, Factory>;
+
+  float entropy = 0;
+
+  auto measure = [&entropy](
+      unsigned k, const StateSpace& state_space, const State& state) {
+    // Calculate entropy.
+
+    entropy = 0;
+    auto size = uint64_t{1} << state.num_qubits();
+
+    for (uint64_t i = 0; i < size; ++i) {
+      auto ampl = state_space.GetAmpl(state, i);
+      float p = std::norm(ampl);
+      entropy -= p * std::log(p);
+    }
+  };
+
+  Runner::Parameter param;
+  param.seed = 1;
+  param.verbosity = 0;
+
+  EXPECT_TRUE(Runner::Run(param, Factory(), circuit, measure));
+
+  EXPECT_NEAR(entropy, 2.2192848, 1e-6);
+}
+
+TEST(RunQSimTest, QSimRunner2) {
+  std::stringstream ss(circuit_string);
+  Circuit<GateQSim<float>> circuit;
+
+  EXPECT_TRUE(CircuitQsimParser<IO>::FromStream(99, provider, ss, circuit));
+  EXPECT_EQ(circuit.num_qubits, 4);
+  EXPECT_EQ(circuit.gates.size(), 27);
+
+  using Simulator = Factory::Simulator;
+  using StateSpace = Simulator::StateSpace;
+  using State = StateSpace::State;
+  using Runner = CuStateVecExRunner<IO, Factory>;
+
+  Factory factory;
+  StateSpace state_space = factory.CreateStateSpace();
+  State state = state_space.Create(circuit.num_qubits);
+
+  EXPECT_FALSE(state_space.IsNull(state));
+
+  state_space.SetStateZero(state);
+
+  Runner::Parameter param;
+  param.seed = 1;
+  param.verbosity = 0;
+
+  EXPECT_TRUE(Runner::Run(param, Factory(), circuit, state));
+
+  // Calculate entropy.
+
+  float entropy = 0;
+  auto size = uint64_t{1} << circuit.num_qubits;
+
+  for (uint64_t i = 0; i < size; ++i) {
+    auto ampl = state_space.GetAmpl(state, i);
+    float p = std::norm(ampl);
+    entropy -= p * std::log(p);
+  }
+
+  EXPECT_NEAR(entropy, 2.2192848, 1e-6);
+}
+
+constexpr char sample_circuit_string[] =
+R"(2
+0 h 0
+0 x 1
+1 m 1
+2 cx 0 1
+3 m 0 1
+4 m 0
+5 cx 1 0
+6 m 0
+7 x 0
+7 h 1
+8 m 0 1
+)";
+
+TEST(RunQSimTest, QSimSampler) {
+  std::stringstream ss(sample_circuit_string);
+  Circuit<GateQSim<float>> circuit;
+
+  EXPECT_TRUE(CircuitQsimParser<IO>::FromStream(99, provider, ss, circuit));
+  EXPECT_EQ(circuit.num_qubits, 2);
+  EXPECT_EQ(circuit.gates.size(), 11);
+
+  using Simulator = Factory::Simulator;
+  using StateSpace = Simulator::StateSpace;
+  using Result = StateSpace::MeasurementResult;
+  using State = StateSpace::State;
+  using Runner = CuStateVecExRunner<IO, Factory>;
+
+  Factory factory;
+  StateSpace state_space = factory.CreateStateSpace();
+  State state = state_space.Create(circuit.num_qubits);
+
+  EXPECT_FALSE(state_space.IsNull(state));
+
+  state_space.SetStateZero(state);
+
+  std::vector<Result> results;
+
+  Runner::Parameter param;
+  param.seed = 1;
+  param.verbosity = 0;
+
+  EXPECT_TRUE(Runner::Run(param, Factory(), circuit, state, results));
+
+  // Results should contain (qubit @ time):
+  // (1 @ 1) - should be |01)
+  EXPECT_TRUE(results[0].bitstring[0]);
+  // (0 @ 3), (1 @ 3) - either |01) or |10)
+  EXPECT_EQ(results[1].bitstring[0], !results[1].bitstring[1]);
+  // (0 @ 4) - should match (0 @ 3)
+  EXPECT_EQ(results[1].bitstring[0], results[2].bitstring[0]);
+  // (0 @ 6) - either |11) or |10)
+  EXPECT_TRUE(results[3].bitstring[0]);
+  // (0 @ 8), (1 @ 8) - should be |00)
+  EXPECT_FALSE(results[4].bitstring[0]);
+  EXPECT_FALSE(results[4].bitstring[1]);
+}
+
+TEST(RunQSimTest, CirqGates) {
+  auto circuit = CirqCircuit1::GetCircuit<float>(false);
+  const auto& expected_results = CirqCircuit1::expected_results0;
+
+  using Simulator = Factory::Simulator;
+  using StateSpace = Simulator::StateSpace;
+  using State = StateSpace::State;
+  using Runner = CuStateVecExRunner<IO, Factory>;
+
+  Factory factory;
+  StateSpace state_space = factory.CreateStateSpace();
+  State state = state_space.Create(circuit.num_qubits);
+
+  auto size = uint64_t{1} << circuit.num_qubits;
+
+  EXPECT_FALSE(state_space.IsNull(state));
+  EXPECT_EQ(size, expected_results.size());
+
+  state_space.SetStateZero(state);
+
+  Runner::Parameter param;
+  param.seed = 1;
+  param.verbosity = 0;
+
+  EXPECT_TRUE(Runner::Run(param, Factory(), circuit, state));
+
+  for (uint64_t i = 0; i < size; ++i) {
+    auto ampl = state_space.GetAmpl(state, i);
+    EXPECT_NEAR(std::real(ampl), std::real(expected_results[i]), 2e-6);
+    EXPECT_NEAR(std::imag(ampl), std::imag(expected_results[i]), 2e-6);
+  }
+}
+
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  qsim::mp.initialize();
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/simulator_custatevecex_test.cu b/tests/simulator_custatevecex_test.cu
new file mode 100644
index 000000000..dcf9eaf65
--- /dev/null
+++ b/tests/simulator_custatevecex_test.cu
@@ -0,0 +1,105 @@
+// Copyright 2025 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "simulator_testfixture.h"
+
+#include <custatevecEx.h>
+
+#include <type_traits>
+
+#include "gtest/gtest.h"
+
+#include "../lib/multiprocess_custatevecex.h"
+#include "../lib/simulator_custatevecex.h"
+
+namespace qsim {
+
+template <class T>
+class SimulatorCuStateVecExTest : public testing::Test {};
+
+//using fp_impl = ::testing::Types<float, double>;
+using fp_impl = ::testing::Types<float>;
+
+TYPED_TEST_SUITE(SimulatorCuStateVecExTest, fp_impl);
+
+MultiProcessCuStateVecEx mp;
+
+template <typename fp_type>
+struct Factory {
+  using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
+  using StateSpace = typename Simulator::StateSpace;
+
+  StateSpace CreateStateSpace() const {
+    return StateSpace{mp};
+  }
+
+  Simulator CreateSimulator() const {
+    return Simulator{};
+  }
+};
+
+TYPED_TEST(SimulatorCuStateVecExTest, ApplyGate1) {
+  TestApplyGate1(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, ApplyGate2) {
+  TestApplyGate2(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, ApplyGate3) {
+  TestApplyGate3(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, ApplyGate5) {
+  TestApplyGate5(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, CircuitWithControlledGates) {
+  TestCircuitWithControlledGates(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, CircuitWithControlledGatesDagger) {
+  TestCircuitWithControlledGatesDagger(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, MultiQubitGates) {
+  TestMultiQubitGates(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, ControlledGates) {
+  bool high_precision = std::is_same<TypeParam, double>::value;
+  TestControlledGates(qsim::Factory<TypeParam>(), high_precision, true);
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, GlobalPhaseGate) {
+  TestGlobalPhaseGate(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, ExpectationValue1) {
+  TestExpectationValue1(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(SimulatorCuStateVecExTest, ExpectationValue2) {
+  TestExpectationValue2(qsim::Factory<TypeParam>());
+}
+
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  qsim::mp.initialize();
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/simulator_testfixture.h b/tests/simulator_testfixture.h
index 2e15c287f..bf46353a7 100644
--- a/tests/simulator_testfixture.h
+++ b/tests/simulator_testfixture.h
@@ -362,7 +362,8 @@ void TestCircuitWithControlledGates(const Factory& factory) {
   using fp_type = typename StateSpace::fp_type;
   using Gate = GateQSim<fp_type>;
 
-  unsigned num_qubits = 6;
+  unsigned num_qubits = 7;
+  unsigned size = 1 << (num_qubits - 1);
 
   std::vector<Gate> gates;
   gates.reserve(128);
@@ -722,10 +723,8 @@ if __name__ == '__main__':
     {-0.18774915, 0.12311842},
   };
 
-  unsigned size = 1 << num_qubits;
-
   for (unsigned i = 0; i < size; ++i) {
-    auto a = StateSpace::GetAmpl(state1, i);
+    auto a = state_space.GetAmpl(state1, i);
     EXPECT_NEAR(std::real(a), expected_results[i][0], 1e-6);
     EXPECT_NEAR(std::imag(a), expected_results[i][1], 1e-6);
   }
@@ -740,8 +739,8 @@ if __name__ == '__main__':
   }
 
   for (unsigned i = 0; i < size; ++i) {
-    auto a1 = StateSpace::GetAmpl(state1, i);
-    auto a2 = StateSpace::GetAmpl(state2, i);
+    auto a1 = state_space.GetAmpl(state1, i);
+    auto a2 = state_space.GetAmpl(state2, i);
     EXPECT_EQ(std::real(a1), std::real(a2));
     EXPECT_EQ(std::imag(a1), std::imag(a2));
   }
@@ -756,8 +755,8 @@ if __name__ == '__main__':
   }
 
   for (unsigned i = 0; i < size; ++i) {
-    auto a1 = StateSpace::GetAmpl(state1, i);
-    auto a2 = StateSpace::GetAmpl(state3, i);
+    auto a1 = state_space.GetAmpl(state1, i);
+    auto a2 = state_space.GetAmpl(state3, i);
     EXPECT_EQ(std::real(a1), std::real(a2));
     EXPECT_EQ(std::imag(a1), std::imag(a2));
   }
@@ -770,8 +769,8 @@ void TestCircuitWithControlledGatesDagger(const Factory& factory) {
   using fp_type = typename StateSpace::fp_type;
   using Gate = GateQSim<fp_type>;
 
-  unsigned num_qubits = 6;
-  unsigned size = 1 << num_qubits;
+  unsigned num_qubits = 7;
+  unsigned size = 1 << (num_qubits - 1);
 
   std::vector<Gate> gates;
   gates.reserve(128);
@@ -1133,10 +1132,10 @@ if __name__ == '__main__':
 
 */
 
-  EXPECT_NEAR(std::real(StateSpace::GetAmpl(state, 0)), 1, 1e-6);
-  EXPECT_NEAR(std::imag(StateSpace::GetAmpl(state, 0)), 0, 1e-6);
+  EXPECT_NEAR(std::real(state_space.GetAmpl(state, 0)), 1, 1e-6);
+  EXPECT_NEAR(std::imag(state_space.GetAmpl(state, 0)), 0, 1e-6);
   for (unsigned i = 1; i < size; ++i) {
-    auto a = StateSpace::GetAmpl(state, i);
+    auto a = state_space.GetAmpl(state, i);
     EXPECT_NEAR(std::real(a), 0, 1e-6);
     EXPECT_NEAR(std::imag(a), 0, 1e-6);
   }
@@ -1162,14 +1161,14 @@ void TestMultiQubitGates(const Factory& factory) {
 
   std::vector<fp_type> vec(state_space.MinSize(max_num_qubits));
 
-  for (unsigned num_qubits = 1; num_qubits <= max_num_qubits; ++num_qubits) {
+  for (unsigned num_qubits = 2; num_qubits <= max_num_qubits; ++num_qubits) {
     auto state = state_space.Create(num_qubits);
 
     unsigned size = 1 << num_qubits;
     fp_type inorm = std::sqrt(1.0 / (1 << num_qubits));
-    unsigned max_gate_qubits2 = std::min(max_gate_qubits, num_qubits);
+    unsigned max_gate_qubits2 = std::min(max_gate_qubits, num_qubits - 1);
 
-    for (unsigned q = 0; q <= max_gate_qubits2; ++q) {
+    for (unsigned q = 1; q <= max_gate_qubits2; ++q) {
       unsigned size1 = 1 << q;
       unsigned size2 = size1 * size1;
 
@@ -1432,10 +1431,10 @@ void TestExpectationValue1(const Factory& factory) {
 
   std::vector<fp_type> vec(state_space.MinSize(max_num_qubits));
 
-  for (unsigned num_qubits = 1; num_qubits <= max_num_qubits; ++num_qubits) {
+  for (unsigned num_qubits = 2; num_qubits <= max_num_qubits; ++num_qubits) {
     auto state = state_space.Create(num_qubits);
 
-    unsigned max_gate_qubits2 = std::min(max_gate_qubits, num_qubits);
+    unsigned max_gate_qubits2 = std::min(max_gate_qubits, num_qubits - 1);
 
     for (unsigned q = 1; q <= max_gate_qubits2; ++q) {
       unsigned size1 = 1 << q;
diff --git a/tests/statespace_custatevecex_test.cu b/tests/statespace_custatevecex_test.cu
new file mode 100644
index 000000000..db840d7c9
--- /dev/null
+++ b/tests/statespace_custatevecex_test.cu
@@ -0,0 +1,119 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "statespace_testfixture.h"
+
+#include <custatevecEx.h>
+
+#include "gtest/gtest.h"
+
+#include "../lib/multiprocess_custatevecex.h"
+#include "../lib/simulator_custatevecex.h"
+#include "../lib/statespace_custatevecex.h"
+
+namespace qsim {
+
+template <class T>
+class StateSpaceCuStateVecExTest : public testing::Test {};
+
+using fp_impl = ::testing::Types<float, double>;
+
+TYPED_TEST_SUITE(StateSpaceCuStateVecExTest, fp_impl);
+
+MultiProcessCuStateVecEx mp;
+
+template <typename fp_type>
+struct Factory {
+  using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
+  using StateSpace = typename Simulator::StateSpace;
+
+  StateSpace CreateStateSpace() const {
+    return StateSpace{mp};
+  }
+
+  Simulator CreateSimulator() const {
+    return Simulator{};
+  }
+};
+
+TYPED_TEST(StateSpaceCuStateVecExTest, Add) {
+  TestAdd(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, NormSmall) {
+  TestNormSmall(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, NormAndInnerProductSmall) {
+  TestNormAndInnerProductSmall(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, NormAndInnerProduct) {
+  TestNormAndInnerProduct(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, SamplingSmall) {
+  TestSamplingSmall(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, SamplingCrossEntropyDifference) {
+  TestSamplingCrossEntropyDifference(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, Ordering) {
+  TestOrdering(qsim::Factory<TypeParam>());
+}
+
+TEST(StateSpaceCuStateVecExTest, MeasurementSmall) {
+  TestMeasurementSmall(qsim::Factory<float>(), true);
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, MeasurementLarge) {
+//  This test fails.
+//  TestMeasurementLarge(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, Collapse) {
+//  Not implemented.
+//  TestCollapse(qsim::Factory<TypeParam>());
+}
+
+TEST(StateSpaceCuStateVecExTest, InvalidStateSize) {
+  TestInvalidStateSize(qsim::Factory<float>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, BulkSetAmpl) {
+//  Not implemented.
+//  TestBulkSetAmplitude(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, BulkSetAmplExclusion) {
+//  Not implemented.
+//  TestBulkSetAmplitudeExclusion(qsim::Factory<TypeParam>());
+}
+
+TYPED_TEST(StateSpaceCuStateVecExTest, BulkSetAmplDefault) {
+//  Not implemented.
+//  TestBulkSetAmplitudeDefault(qsim::Factory<TypeParam>());
+}
+
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  qsim::mp.initialize();
+
+  return RUN_ALL_TESTS();
+}

From 6b95f3aba3be3b3fce21b3b700cd9cf636113c27 Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 09:21:30 -0800
Subject: [PATCH 12/33] Make use of larger GitHub job runners (#971)

This updates some of the jobs in the CI workflow to use the larger
runners we've configured for our org, so that CI runs faster.
---
 .github/workflows/ci.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index e0968df6a..6185674a0 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -221,10 +221,10 @@ jobs:
         # tests the endpoints of the supported range. The wheel build process
         # (in a separate workflow) *does* use all the versions.
         os:
-          - ubuntu-24.04
+          - ubuntu-24.04-x64-8-core
           - macos-14
           - macos-15
-          - windows-2025
+          - windows-2025-x64-8-core
         python_version:
           - '3.10'
           - '3.13'
@@ -257,13 +257,13 @@ jobs:
         with:
           debug: ${{inputs.debug}}
 
-      - if: matrix.os != 'windows-2025'
+      - if: ${{! startsWith(matrix.os, 'windows-')}}
         name: Build the qsim C++ library and run tests (non-Windows case)
         run: |
           alias bazel=bazelisk
           dev_tools/test_libs.sh ${{inputs.debug && '--config=verbose'}}
 
-      - if: matrix.os == 'windows-2025'
+      - if: ${{startsWith(matrix.os, 'windows-')}}
         name: Build the qsim C++ library and run tests (Windows case)
         # On GitHub Windows runners, Bazel ends up finding a different "python3"
         # binary than what's installed by setup-python unless we tell Bazel what
@@ -311,7 +311,7 @@ jobs:
       - python-checks
       - shell-lint
       - yaml-lint
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-24.04-x64-16-core
     timeout-minutes: 60
     strategy:
       matrix:
@@ -371,7 +371,7 @@ jobs:
       - python-checks
       - shell-lint
       - yaml-lint
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-24.04-x64-16-core
     timeout-minutes: 60
     env:
       common_args: >-
@@ -431,7 +431,7 @@ jobs:
       - python-checks
       - shell-lint
       - yaml-lint
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-24.04-x64-16-core
     timeout-minutes: 60
     env:
       # The next environment variable is used by Docker.

From 567fb255d4e13dc4224828960625194d72b103a5 Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 09:23:14 -0800
Subject: [PATCH 13/33] Add check/pylint (#978)

This is a simple script to help contributors easily run pylint
recursively on the right directories. It also passes `--jobs=0` to make
it as fast as possible by default.
---
 check/pylint | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100755 check/pylint

diff --git a/check/pylint b/check/pylint
new file mode 100755
index 000000000..5fd9d9cdd
--- /dev/null
+++ b/check/pylint
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Summary: make it easy to run pylint on directories that contain Python files
+# and avoid tests/googletest/.
+
+set -e
+
+declare -r usage="Usage: ${0} [-h | --help | help] [args ...]
+
+If the first argument on the command line is -h, --help, or help, this
+program prints usage information and then exits. Otherwise, it runs Pylint on
+the Python files of this project. It passes all command-line arguments (other
+than -h, --help, or help) to Pylint along with the project source directories."
+
+# Exit early if the user requested help.
+if [[ "${1}" == "-h" || "${1}" == "--help" || "${1}" == "help" ]]; then
+    echo "$usage"
+    exit 0
+fi
+
+# Go to the project root.
+thisdir=$(dirname "${BASH_SOURCE[0]:?}")
+repo_dir=$(git -C "${thisdir}" rev-parse --show-toplevel)
+cd "${repo_dir}"
+
+pylint --jobs=0 --ignore-paths=tests/googletest "$@" .

From 59e15b8518b2ed78c9e11be27aa870098553697d Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 09:34:41 -0800
Subject: [PATCH 14/33] Use GHA "working-directory" instead of cd'ing (#969)

A better practice for jobs where the first step is to `cd` into a
directory is to use the field `working-directory` in the step definition
and skip the `cd`.
---
 .github/workflows/ci.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 6185674a0..c574625f4 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -461,9 +461,8 @@ jobs:
         run: docker run --rm qsim-base:latest -c /qsim/circuits/circuit_q24
 
       - name: Test installation process
-        run: |
-          cd install/tests
-          docker compose build
+        working-directory: install/tests
+        run: docker compose build
 
   report-results:
     name: CI

From 2fe8c97b83404d4a3f91a41446a9fc68061e1d1a Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 09:35:16 -0800
Subject: [PATCH 15/33] Detect missing cpuinfo package when running
 test_libs.sh and let the user know (#967)

Due to the fact that the package name is not the same as the module name
(`py-cpuinfo` vs `cpuinfo`), the error that results from trying to run
this script when `py-cpuinfo` is not installed can be confusing. Let's
help users & developers by testing for the package and pointing them in
the right direction.

Also, slightly improve the clarity of the usage message text in this
script.

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 dev_tools/test_libs.sh | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/dev_tools/test_libs.sh b/dev_tools/test_libs.sh
index c640f91a8..7efa58f73 100755
--- a/dev_tools/test_libs.sh
+++ b/dev_tools/test_libs.sh
@@ -16,11 +16,15 @@
 set -eo pipefail -o errtrace
 
 declare -r usage="Usage: ${0##*/} [-h | --help | help] [bazel options ...]
-Run the programs in tests/, and on Linux, also build the programs in apps/.
+
+Invokes Bazel to run the programs in tests/, and on Linux, also build the
+sample programs in apps/.
 
 If the first option on the command line is -h, --help, or help, this help text
 will be printed and the program will exit. Any other options on the command
-line are passed directly to Bazel."
+line are passed directly to Bazel.
+
+This script makes use of the Python package 'py-cpuinfo'."
 
 # Exit early if the user requested help.
 if [[ "$1" == "-h" || "$1" == "--help" || "$1" == "help" ]]; then
@@ -28,6 +32,11 @@ if [[ "$1" == "-h" || "$1" == "--help" || "$1" == "help" ]]; then
     exit 0
 fi
 
+if ! python -m pip show -qq py-cpuinfo 2>/dev/null; then
+    echo "Error: missing 'py-cpuinfo'. Please install dev-requirements.txt." >&2
+    exit 1
+fi
+
 # Look for AVX and SSE in the processor's feature flags.
 declare features=""
 declare filters=""

From 464a2663e999b6328706c00ffb46c49aba855e19 Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 11:20:03 -0800
Subject: [PATCH 16/33] In tests/Makefile, send grep output to /dev/null (#988)

On systems where there is no /proc/cpuinfo, you get an error message. We
don't need to see it; we just need the result of the grep.
---
 tests/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/Makefile b/tests/Makefile
index 46f35e492..4fef3e3fe 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -13,9 +13,9 @@
 # limitations under the License.
 
 # Determine the hardware features available in this CPU.
-HAVE_SSE := $(shell grep -q sse /proc/cpuinfo && echo "true")
-HAVE_AVX2 := $(shell grep -q avx2 /proc/cpuinfo && echo "true")
-HAVE_AVX512 := $(shell grep -q avx512f /proc/cpuinfo && echo "true")
+HAVE_SSE := $(shell grep -qs sse /proc/cpuinfo && echo "true")
+HAVE_AVX2 := $(shell grep -qs avx2 /proc/cpuinfo && echo "true")
+HAVE_AVX512 := $(shell grep -qs avx512f /proc/cpuinfo && echo "true")
 
 # Default targets. Always built.
 BASIC_FILES := $(shell ls *.cc | egrep -v '_avx|_sse')

From 34ccd4f49864c9f039f97a9e308c321f73a75f4b Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Sun, 18 Jan 2026 07:31:00 +0000
Subject: [PATCH 17/33] Merge differences

---
 setup.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 2ac063ee8..3b4fe7272 100644
--- a/setup.py
+++ b/setup.py
@@ -116,11 +116,16 @@ def build_extension(self, ext):
 
         env = os.environ.copy()
         cxxflags = env.get("CXXFLAGS", "")
-        env["CXXFLAGS"] = f'{cxxflags} -DVERSION_INFO=\\"{__version__}\\"'
+        env["CXXFLAGS"] = (
+            f'{cxxflags} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"'
+        )
         if not os.path.exists(self.build_temp):
             os.makedirs(self.build_temp)
+        # CMake picks up CXXFLAGS from the environment when configuring.
         subprocess.check_call(
-            ["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env
+            ["cmake", ext.sourcedir] + cmake_args,
+            cwd=self.build_temp,
+            env=env,
         )
         subprocess.check_call(
             ["cmake", "--build", ".", "--verbose"] + build_args,

From f2bdcc60b3e622a761e4cb5703c22a600af113f5 Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 11:53:37 -0800
Subject: [PATCH 18/33] Set HOMEBREW_NO_AUTO_UPDATE in CI (#973)

Tell Homebrew not to auto-update when running. Auto-updating is not
necessary here and just wastes time when it happens.
---
 .github/workflows/ci.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index c574625f4..f71a0fe19 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -276,6 +276,8 @@ jobs:
 
       - name: Install LLVM and OpenMP on macOS
         if: startsWith(matrix.os, 'macos')
+        env:
+          HOMEBREW_NO_AUTO_UPDATE: 1
         run: |
           brew install -q libomp llvm@19
           brew unlink libomp

From 22a3eed33bbfa0856393f8a73facb1fb3dd2431f Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 11:55:28 -0800
Subject: [PATCH 19/33] Add `.gitattributes` file for project (#974)

IMHO it's useful to see Markdown files as one of the types of files
listed on the GitHub front page in the file statistics section,
alongside other files. It gives a sense for the amount of documentation
present in a project.

On the other hand, some configuration files are not useful to count,
such as .md files in a `.gemini/` subdirectory and configuration files
for tools like git.
---
 .gitattributes | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 .gitattributes

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 000000000..fe3dad05f
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,22 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Include Markdown files in GitHub's file statistics for this repo.
+*.md linguist-detectable
+
+# Exclude config files from GitHub's file statistics. GitHub's Linguist already
+# recognizes most config files as data; this adds some it doesn't.
+.gemini/**             linguist-documentation
+.git-blame-ignore-revs linguist-documentation
+.markdownlintrc        linguist-documentation

From 2037e80e1dfb5c4b1dd6790e2aed0778abdfb437 Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 11:56:36 -0800
Subject: [PATCH 20/33] Add configuration for isort to `pyproject.toml` (#987)

`isort` is used in `check/format-incremental`, but there is no
configuration for it. This adds a configuration based on what Cirq
currently uses.
---
 pyproject.toml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index a4f3e4613..bd1c5102c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -150,3 +150,11 @@ skip = "*musllinux*"
 [tool.black]
 target-version = ['py310', 'py311', 'py312', 'py313']
 extend-exclude = 'third_party'
+
+[tool.isort]
+profile = 'black'
+order_by_type = false  # Sort alphabetically, irrespective of case.
+skip_gitignore = true
+combine_as_imports = true
+known_first_party = ["qsimcirq*"]
+extend_skip = ["__init__.py"]

From e0d71a4eeb25b93b5025efc0b6f8f7cb9fa7f44a Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Mon, 5 Jan 2026 23:21:32 -0800
Subject: [PATCH 21/33] Fix #929 by updating Eigen to version 3.4.1 (#934)

The warning described in https://github.com/quantumlib/qsim/issues/929
seems to be the result of using an older version of Eigen. Updating the
version makes the warning go away.
---
 Makefile  | 10 +++++-----
 WORKSPACE |  5 +++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 4b37317c2..b59563d5d 100644
--- a/Makefile
+++ b/Makefile
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Version info for the copy of Eigen we will download and build locally.
-EIGEN_PREFIX = "3bb6a48d8c171cf20b5f8e48bfb4e424fbd4f79e"
+# If this is changed, update the value in ./WORKSPACE too.
+EIGEN_COMMIT = "b66188b5dfd147265bfa9ec47595ca0db72d21f5"
 EIGEN_URL = "https://gitlab.com/libeigen/eigen/-/archive/"
 
 # Default build targets. Additional may be added conditionally below.
@@ -197,9 +197,9 @@ check-cuquantum-root-set:
 
 eigen:
 	-rm -rf eigen
-	wget $(EIGEN_URL)/$(EIGEN_PREFIX)/eigen-$(EIGEN_PREFIX).tar.gz
-	tar -xzf eigen-$(EIGEN_PREFIX).tar.gz && mv eigen-$(EIGEN_PREFIX) eigen
-	rm eigen-$(EIGEN_PREFIX).tar.gz
+	wget $(EIGEN_URL)/$(EIGEN_COMMIT)/eigen-$(EIGEN_COMMIT).tar.gz
+	tar -xzf eigen-$(EIGEN_COMMIT).tar.gz && mv eigen-$(EIGEN_COMMIT) eigen
+	rm eigen-$(EIGEN_COMMIT).tar.gz
 
 .PHONY: clean
 clean:
diff --git a/WORKSPACE b/WORKSPACE
index 6f064c52e..0499366fd 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -55,9 +55,10 @@ load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0")
 
 tf_workspace0()
 
-EIGEN_COMMIT = "d71c30c47858effcbd39967097a2d99ee48db464"  # 3.4.1
+# https://gitlab.com/libeigen/eigen/-/releases/3.4.1
+EIGEN_COMMIT = "b66188b5dfd147265bfa9ec47595ca0db72d21f5"
 
-EIGEN_SHA256 = "f1d28c2205d015490a685b1e5a171c434da87f757746724de3cb85e69621dec2"
+EIGEN_SHA256 = "2c167ff09e88a5261111bc2aa7f18ae2e78d73fd42339387532937b0c2629829"
 
 http_archive(
     name = "eigen",

From 7926ba541dcd2714f6e80776f9d55c92aad0c7a2 Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Tue, 6 Jan 2026 07:15:03 -0800
Subject: [PATCH 22/33] Fix Makefile handling of user CXXFLAGS & also add a
 DEBUG flag (#990)

The way `CXXFLAGS` and other flags were being set meant that users had
to add the default values (`-std=c++17 -fopenmp -O3` etc) if they set
the flags at all, or else lose the defaults. This was error-prone and
suboptimal. The change here makes it so that the user values are
appended at the end of the defaults, so that they can override values.

This PR also adds a DEBUG flag that switches between using `-g -O0` and
the regular `-O3` option. Developers can invoke it using, e.g., `make
DEBUG=1`.
---
 Makefile | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index b59563d5d..187f4f9b0 100644
--- a/Makefile
+++ b/Makefile
@@ -16,11 +16,12 @@
 EIGEN_COMMIT = "b66188b5dfd147265bfa9ec47595ca0db72d21f5"
 EIGEN_URL = "https://gitlab.com/libeigen/eigen/-/archive/"
 
-# Default build targets. Additional may be added conditionally below.
+# Default build targets. Additional ones are added conditionally below.
 TARGETS = qsim
 TESTS = run-cxx-tests
 
 # By default, we also build the pybind11-based Python interface.
+# Can be overridden via env variables or command-line flags
 PYBIND11 ?= true
 
 ifeq ($(PYBIND11), true)
@@ -31,14 +32,38 @@ endif
 # Default options for Pytest (only used if the pybind interface is built).
 PYTESTFLAGS ?= -v
 
-# Default C++ compilers and compiler flags. Can be overriden via env variables.
+# Default compilers and compiler flags.
+# Can be overridden via env variables or command-line flags.
 CXX ?= g++
 NVCC ?= nvcc
 HIPCC ?= hipcc
 
-CXXFLAGS ?= -O3 -std=c++17 -fopenmp -flto=auto
-NVCCFLAGS ?= -O3 --std c++17 -Wno-deprecated-gpu-targets
-HIPCCFLAGS ?= -O3
+BASE_CXXFLAGS := -std=c++17 -fopenmp
+BASE_NVCCFLAGS := -std c++17 -Wno-deprecated-gpu-targets
+BASE_HIPCCFLAGS :=
+
+CXXFLAGS := $(BASE_CXXFLAGS) $(CXXFLAGS)
+NVCCFLAGS := $(BASE_NVCCFLAGS) $(NVCCFLAGS)
+HIPCCFLAGS := $(BASE_HIPCCFLAGS) $(HIPCCFLAGS)
+
+# Use plain -flto when $(CXX) identifies itself as Clang. The shell strips the
+# quotes from echo "true", so the comparison below must use the bare word.
+LTO_FLAGS := -flto=auto
+USING_CLANG := $(shell $(CXX) --version | grep -isq clang && echo "true")
+ifeq ($(USING_CLANG),true)
+    LTO_FLAGS := -flto
+endif
+
+ifdef DEBUG
+    DEBUG_FLAGS := -g -O0
+    CXXFLAGS += $(DEBUG_FLAGS)
+    NVCCFLAGS += $(DEBUG_FLAGS)
+    HIPCCFLAGS += $(DEBUG_FLAGS)
+else
+    CXXFLAGS += -O3 $(LTO_FLAGS)
+    NVCCFLAGS += -O3
+    HIPCCFLAGS += -O3
+endif
 
 # For compatibility with CMake, if $CUDAARCHS is set, use it to set the
 # architecture options to nvcc. Otherwise, default to the "native" option,

From e478fb17b1788d98324228840e979490220abee9 Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Tue, 6 Jan 2026 07:16:00 -0800
Subject: [PATCH 23/33] Fix #892: use -msse4 rather than -msse4.1 in Makefiles
 (#991)

There was an inconsistency in the Makefiles versus `tests/BUILD` and
`tests/make.sh`, in that a couple of the Makefiles used `-msse4.1`
while the BUILD file and `tests/make.sh` used `-msse4`. In addition, it
seems that `-msse4` subsumes `-msse4.1`, and so overall, it appears
better to use `-msse4`.

Fixes #892.
---
 pybind_interface/Makefile           | 2 +-
 pybind_interface/sse/CMakeLists.txt | 2 +-
 tests/Makefile                      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pybind_interface/Makefile b/pybind_interface/Makefile
index f9693c82c..2c30f678e 100644
--- a/pybind_interface/Makefile
+++ b/pybind_interface/Makefile
@@ -41,7 +41,7 @@ endif
 
 # The flags for the compilation of the simd-specific Pybind11 interfaces
 PYBINDFLAGS_BASIC = $(PYBINDFLAGS)
-PYBINDFLAGS_SSE = -msse4.1 $(PYBINDFLAGS)
+PYBINDFLAGS_SSE = -msse4 $(PYBINDFLAGS)
 PYBINDFLAGS_AVX2 = -mavx2 -mfma $(PYBINDFLAGS)
 PYBINDFLAGS_AVX512 = -mavx512f -mbmi2 $(PYBINDFLAGS)
 
diff --git a/pybind_interface/sse/CMakeLists.txt b/pybind_interface/sse/CMakeLists.txt
index fee561006..195af28c3 100644
--- a/pybind_interface/sse/CMakeLists.txt
+++ b/pybind_interface/sse/CMakeLists.txt
@@ -21,7 +21,7 @@ if(WIN32)
     # This prevents a conflict with /RTC1 in DEBUG builds.
     add_compile_options($<$<NOT:$<CONFIG:Debug>>:/O2>)
 else()
-    add_compile_options(-msse4.1 -O3 -flto=auto)
+    add_compile_options(-msse4 -O3 -flto=auto)
 endif()
 
 if(APPLE)
diff --git a/tests/Makefile b/tests/Makefile
index 4fef3e3fe..6b5eee410 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -25,7 +25,7 @@ SSE_FILES =
 AVX2_FILES =
 AVX512_FILES =
 ifneq (,$(HAVE_SSE))
-    SSE_FLAGS ?= -msse4.1
+    SSE_FLAGS ?= -msse4
     SSE_FILES := $(wildcard *_sse_test.cc)
 endif
 ifneq (,$(HAVE_AVX2))

From 8dfe69e5deba0914d3e65d7fb477db59dcb8ffcf Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Tue, 6 Jan 2026 10:33:57 -0800
Subject: [PATCH 24/33] Fix #904: test that repo owner is Quantumlib (#968)

This adds checks to the GitHub Actions workflows for the owner being
Quantumlib, so that users who fork the repo will not be surprised by the
workflows running in their forks.
---
 .github/workflows/ci.yaml                | 11 ++++++++++-
 .github/workflows/cirq_compatibility.yml |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f71a0fe19..365b316a8 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -97,6 +97,7 @@ jobs:
 
   docker-lint:
     name: Dockerfile lint checks
+    if: github.repository_owner == 'quantumlib'
     # ubuntu-slim runners don't have docker installed.
     runs-on: ubuntu-24.04
     timeout-minutes: 15
@@ -119,6 +120,7 @@ jobs:
 
   shell-lint:
     name: Shell script lint checks
+    if: github.repository_owner == 'quantumlib'
     runs-on: ubuntu-slim
     timeout-minutes: 15
     steps:
@@ -141,6 +143,7 @@ jobs:
 
   yaml-lint:
     name: YAML lint checks
+    if: github.repository_owner == 'quantumlib'
     runs-on: ubuntu-slim
     timeout-minutes: 15
     steps:
@@ -165,6 +168,7 @@ jobs:
 
   bazel-lint:
     name: Bazel build lint checks
+    if: github.repository_owner == 'quantumlib'
     runs-on: ubuntu-slim
     timeout-minutes: 15
     steps:
@@ -188,6 +192,7 @@ jobs:
 
   action-lint:
     name: GitHub Actions lint checks
+    if: github.repository_owner == 'quantumlib'
     runs-on: ubuntu-slim
     timeout-minutes: 15
     steps:
@@ -206,6 +211,7 @@ jobs:
 
   library-tests:
     name: Library tests
+    if: github.repository_owner == 'quantumlib'
     needs:
       - action-lint
       - bazel-lint
@@ -307,6 +313,7 @@ jobs:
 
   options-tests:
     name: Options tests
+    if: github.repository_owner == 'quantumlib'
     needs:
       - action-lint
       - bazel-lint
@@ -367,6 +374,7 @@ jobs:
 
   memory-tests:
     name: Malloc/asan/msan tests
+    if: github.repository_owner == 'quantumlib'
     needs:
       - action-lint
       - bazel-lint
@@ -427,6 +435,7 @@ jobs:
 
   docker-tests:
     name: Docker build tests
+    if: github.repository_owner == 'quantumlib'
     needs:
       - action-lint
       - docker-lint
@@ -468,7 +477,7 @@ jobs:
 
   report-results:
     name: CI
-    if: always()
+    if: always() && github.repository_owner == 'quantumlib'
     needs:
       - action-lint
       - bazel-lint
diff --git a/.github/workflows/cirq_compatibility.yml b/.github/workflows/cirq_compatibility.yml
index 18d974bef..12b396a18 100644
--- a/.github/workflows/cirq_compatibility.yml
+++ b/.github/workflows/cirq_compatibility.yml
@@ -35,6 +35,7 @@ concurrency:
 
 jobs:
   test-compatibility:
+    if: github.repository_owner == 'quantumlib'
     name: Test Cirq compatibility
     runs-on: ubuntu-24.04
     timeout-minutes: 30

From bd95d8691504be9e17aca840455604c706e2729d Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Wed, 14 Jan 2026 09:27:50 -0800
Subject: [PATCH 25/33] Add `-mbmi2` flag when `-mavx2` is used in Makefiles
 (#992)

It looks like it's possible to use the `-mbmi2` flag when `-mavx2` is
used, based on documentation and on testing locally. This was done in
some Makefiles but not consistently. This PR adds the flag to Makefiles
where it was missing, plus also in one BUILD file.
---
 pybind_interface/avx2/CMakeLists.txt |  9 +++++++-
 tests/Makefile                       | 34 ++++++++++++++++++----------
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/pybind_interface/avx2/CMakeLists.txt b/pybind_interface/avx2/CMakeLists.txt
index cbd6ea2d3..b8a989d2b 100644
--- a/pybind_interface/avx2/CMakeLists.txt
+++ b/pybind_interface/avx2/CMakeLists.txt
@@ -20,8 +20,15 @@ if(WIN32)
     # Add /O2 to any configuration that is NOT Debug.
     # This prevents a conflict with /RTC1 in DEBUG builds.
     add_compile_options($<$<NOT:$<CONFIG:Debug>>:/O2>)
-else()
+elseif(LINUX)
     add_compile_options(-mavx2 -mfma -O3 -flto=auto)
+    execute_process(
+        COMMAND bash --noprofile -c "grep -qs bmi2 /proc/cpuinfo"
+        RESULT_VARIABLE _EXIT_CODE
+    )
+    if(_EXIT_CODE EQUAL 0)
+        add_compile_options("-mbmi2")
+    endif()
 endif()
 
 if(APPLE)
diff --git a/tests/Makefile b/tests/Makefile
index 6b5eee410..c02f470fd 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -13,33 +13,34 @@
 # limitations under the License.
 
 # Determine the hardware features available in this CPU.
-HAVE_SSE := $(shell grep -qs sse /proc/cpuinfo && echo "true")
 HAVE_AVX2 := $(shell grep -qs avx2 /proc/cpuinfo && echo "true")
 HAVE_AVX512 := $(shell grep -qs avx512f /proc/cpuinfo && echo "true")
+HAVE_BMI2 := $(shell grep -qs bmi2 /proc/cpuinfo && echo "true")
+HAVE_SSE := $(shell grep -qs sse /proc/cpuinfo && echo "true")
 
 # Default targets. Always built.
 BASIC_FILES := $(shell ls *.cc | egrep -v '_avx|_sse')
 
 # Additional flags and targets for non-CUDA cases.
-SSE_FILES =
-AVX2_FILES =
-AVX512_FILES =
-ifneq (,$(HAVE_SSE))
-    SSE_FLAGS ?= -msse4
-    SSE_FILES := $(wildcard *_sse_test.cc)
-endif
 ifneq (,$(HAVE_AVX2))
     AVX2_FLAGS ?= -mavx2 -mfma
     AVX2_FILES := $(wildcard *_avx_test.cc)
 endif
 ifneq (,$(HAVE_AVX512))
-    AVX512_FLAGS ?= -mavx512f -mbmi2
+    AVX512_FLAGS ?= -mavx512f
     AVX512_FILES := $(wildcard *_avx512_test.cc)
 endif
+ifneq (,$(HAVE_BMI2))
+    BMI2_FLAGS ?= -mbmi2
+endif
+ifneq (,$(HAVE_SSE))
+    SSE_FLAGS ?= -msse4
+    SSE_FILES := $(wildcard *_sse_test.cc)
+endif
 
 CXX_FILES := $(BASIC_FILES) $(SSE_FILES) $(AVX2_FILES) $(AVX512_FILES)
 CXX_TARGETS := $(CXX_FILES:%.cc=%.x)
-CXXFLAGS := $(CXXFLAGS) $(SSE_FLAGS) $(AVX2_FLAGS) $(AVX512_FLAGS)
+CXXFLAGS := $(CXXFLAGS) $(SSE_FLAGS) $(AVX2_FLAGS) $(AVX512_FLAGS) $(BMI2_FLAGS)
 
 CUDA_FILES := $(wildcard *cuda_test.cu)
 CUDA_TARGETS := $(CUDA_FILES:%cuda_test.cu=%cuda_test.x)
@@ -125,8 +126,17 @@ clean:
 	-rm -f ./*.x ./*.a ./*.so ./*.mod
 	rm -rf $(GTEST_DIR)/build
 
-LOCAL_VARS = HAVE_SSE HAVE_AVX2 HAVE_AVX512 SSE_FLAGS AVX2_FLAGS $\
-	AVX512_FLAGS CXXFLAGS CXX_TARGETS TEST_FLAGS
+LOCAL_VARS = BASIC_FILES CXX_FILES CXX_TARGETS CXXFLAGS $\
+    CUDA_FILES CUDA_TARGETS $\
+    CUSTATEVEC_FILES CUSTATEVEC_FLAGS $\
+    CUSTATEVECEX_FILES CUSTATEVECEX_FLAGS $\
+	HAVE_AVX2 AVX2_FILES AVX2_FLAGS $\
+	HAVE_AVX512 AVX512_FILES AVX512_FLAGS $\
+    HAVE_BMI2 BMI2_FLAGS $\
+    HAVE_SSE SSE_FILES SSE_FLAGS $\
+	HIP_FILES HIP_TARGETS $\
+    GMOCK_DIR GTEST_DIR $\
+    TESTFLAGS
 
 .PHONY: print-vars
 print-vars: ; @$(foreach n,$(sort $(LOCAL_VARS)),echo $n=$($n);)

From 603288098ec33f52f1ba7214834b135fc504c127 Mon Sep 17 00:00:00 2001
From: Michael Hucka <mhucka@google.com>
Date: Thu, 15 Jan 2026 07:21:45 -0800
Subject: [PATCH 26/33] In `setup.py`, don't limit CMake version test to only
 Windows (#995)

The `CMakeBuild` class' `run()` method tests the version of CMake;
however, it did the test only when on Windows. It seems like this is a
test worth doing everywhere, so I removed the test for Windows.

In addition, while at it, I slightly updated the way the process output
is captured, and expanded the range of exceptions tested in order to
provide more specific feedback to users.

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 setup.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/setup.py b/setup.py
index 3b4fe7272..84adc51a8 100644
--- a/setup.py
+++ b/setup.py
@@ -38,21 +38,32 @@ def __init__(self, name, sourcedir=""):
 class CMakeBuild(build_ext):
     def run(self):
         try:
-            out = subprocess.check_output(["cmake", "--version"])
-        except OSError:
+            out = subprocess.check_output(["cmake", "--version"], text=True, timeout=15)
+
+            from packaging.version import parse
+
+            cmake_version = parse(re.search(r"version\s*([\d.]+)", out).group(1))
+            if cmake_version < parse("3.28.0"):
+                raise RuntimeError(
+                    f"CMake reports its version is {cmake_version}, but qsim needs "
+                    "version >= 3.28.0."
+                )
+        except FileNotFoundError:
             raise RuntimeError(
                 "CMake must be installed to build the following extensions: "
                 + ", ".join(e.name for e in self.extensions)
             )
-
-        if platform.system() == "Windows":
-            from packaging.version import parse
-
-            cmake_version = parse(
-                re.search(r"version\s*([\d.]+)", out.decode()).group(1)
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(
+                f"Command '{e.cmd}' returned status {e.returncode}. "
+                f"Output: {e.output}"
+            )
+        except subprocess.TimeoutExpired as e:
+            raise RuntimeError(f"Command timed out: {e}")
+        except OSError as e:
+            raise RuntimeError(
+                f"An OS error occurred when trying to run 'cmake --version': {e}"
             )
-            if cmake_version < parse("3.28.0"):
-                raise RuntimeError("CMake >= 3.28.0 is required on Windows")
 
         for ext in self.extensions:
             self.build_extension(ext)

From 9d25d30023fca0a9b007e41724dbc68efcb924d4 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Sat, 17 Jan 2026 00:12:50 +0000
Subject: [PATCH 27/33] Do pip install requirements & group dev in one command

---
 .github/workflows/ci.yaml                | 16 ++++------------
 .github/workflows/cirq_compatibility.yml |  4 +---
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 365b316a8..53b1894f8 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -79,9 +79,7 @@ jobs:
         run: python -m pip install --upgrade pip
 
       - name: Install dependencies
-        run: |
-          pip install -r requirements.txt
-          pip install --group dev
+        run: pip install -r requirements.txt --group dev
 
       - name: Check format
         continue-on-error: ${{inputs.soft-linting == 'true'}}
@@ -254,9 +252,7 @@ jobs:
         run: python -m pip install --upgrade pip
 
       - name: Install dependencies
-        run: |
-          pip install -r requirements.txt
-          pip install --group dev
+        run: pip install -r requirements.txt --group dev
 
       - name: Set up Bazel
         uses: './.github/actions/set-up-bazel'
@@ -347,9 +343,7 @@ jobs:
         run: python -m pip install --upgrade pip
 
       - name: Install dependencies
-        run: |
-          pip install -r requirements.txt
-          pip install --group dev
+        run: pip install -r requirements.txt --group dev
 
       - name: Set up Bazel
         uses: './.github/actions/set-up-bazel'
@@ -408,9 +402,7 @@ jobs:
         run: python -m pip install --upgrade pip
 
       - name: Install dependencies
-        run: |
-          pip install -r requirements.txt
-          pip install --group dev
+        run: pip install -r requirements.txt --group dev
 
       - name: Set up Bazel
         uses: './.github/actions/set-up-bazel'
diff --git a/.github/workflows/cirq_compatibility.yml b/.github/workflows/cirq_compatibility.yml
index 12b396a18..41830adc6 100644
--- a/.github/workflows/cirq_compatibility.yml
+++ b/.github/workflows/cirq_compatibility.yml
@@ -59,9 +59,7 @@ jobs:
         run: pip install --upgrade cirq~=1.0.dev
 
       - name: Install qsim dev requirements
-        run: |
-          pip install -r requirements.txt
-          pip install --group dev
+        run: pip install -r requirements.txt --group dev
 
       - name: Run Python tests
         env:

From 8f66380c21f828658299e2f06093cbf94d63e479 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Sun, 18 Jan 2026 06:46:58 +0000
Subject: [PATCH 28/33] Move cmake & pybind from requirements.txt to
 pyproject.toml

As pointed out by Pavol in the review comments, CMake should only be
needed for building qsim, and not a run-time installation dependency.
---
 pyproject.toml   | 33 +++++++++++++++++++--------------
 requirements.txt |  9 ++-------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bd1c5102c..c1f85c3ed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,14 +14,13 @@
 
 # Note: there are altogether 3 types of dependencies listed in this file:
 #
-#  [build-system].requires: the packages needed for the build system. This list
-#  is not stored in the package metadata.
+#  [build-system].requires: the packages needed by the build backend to build
+#  the project from source. This list is not stored in the package metadata.
 #
-#  [project].dependencies: other packages are minimally needed to be able to
-#  install and run qsimcirq. These are things like Cirq, NumPy, etc. Equivalent
-#  to "install_requires" in setuptools' setup.py. The list gets stored in the
-#  metadata of the package; when the project is installed by pip, this is the
-#  specification that is used to install its dependencies.
+#  [project].dependencies: core packages needed to be able to run qsimcirq.
+#  Equivalent to "install_requires" in setuptools' setup.py. The list is stored
+#  in the metadata of the package; when the project is installed by pip, this is
+#  the specification that is used to install its dependencies.
 #
 #  [dependency-groups].dev: the development dependencies; i.e., what a
 #  developer needs in order to run unit tests, linters, and formatters. The
@@ -32,8 +31,10 @@
 [build-system]
 build-backend = "setuptools.build_meta"
 requires = [
-    "setuptools>=78.1.1",
+    "cmake~=3.28.1",
+    "pybind11[global]",
     "setuptools-scm[toml]>=6.2",
+    "setuptools>=78.1.1",
     "wheel",
 ]
 
@@ -69,12 +70,10 @@ classifiers = [
     "Typing :: Typed",
 ]
 keywords = [
-    "algorithms",
     "cirq",
     "nisq",
     "quantum algorithm development",
     "quantum circuit simulator",
-    "quantum computer simulator",
     "quantum computing",
     "quantum programming",
     "quantum simulation",
@@ -94,11 +93,17 @@ source = "https://github.com/quantumlib/qsim"
 [dependency-groups]
 # Development dependencies. Install these with "pip install --group dev".
 dev = [
-    "black~=25.9.0",
+    # The following repeats [build-system].requires b/c pyproject.toml has no
+    # mechanism to reference that list. Keep these versions in sync with above.
+    "cmake~=3.28.1",
+    "pybind11[global]",
+    "setuptools>=78.1.1; python_version >= '3.12'",
+
+    # Other build, packaging, and distribution utilities.
     "cibuildwheel",
-    # Distutils was removed from Python in 3.12.
-    "setuptools; python_version >= '3.12'",
-    "flynt~=1.0",
+
+    # Linters, formatters, and test utilities.
+    "black~=25.9.0",
     "isort[colors]~=6.0.1",
     "py-cpuinfo",
     "pylint~=4.0.2",
diff --git a/requirements.txt b/requirements.txt
index a927b3363..40ff6bc94 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,11 @@
-# Installation and run-time dependencies for qsimcirq. This file is read
-# by pyproject.toml.
+# Run-time dependencies for qsimcirq. This file is read from pyproject.toml.
 
+# Core dependencies:
 absl-py
 cirq-core~=1.0
 numpy>=1.26.0,<2.0; python_version < '3.11'
 numpy>=2.0; python_version >= '3.11'
 
-# These are needed because installing qsimcirq in some environments may require
-# pip to compile Pybind for that specific platform:
-cmake~=3.28.1
-pybind11[global]
-
 # These are transitive dependencies we need to constrain to avoid unresolvable
 # installation conflicts due to them requiring higher Python versions:
 scipy<1.16; python_version < '3.11'

From 238b1710627f6453ebbe7b644f405e5199d282ef Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Sun, 18 Jan 2026 06:47:12 +0000
Subject: [PATCH 29/33] Remove mention of dev-requirements.txt

---
 dev_tools/test_libs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev_tools/test_libs.sh b/dev_tools/test_libs.sh
index 7efa58f73..16f4417d1 100755
--- a/dev_tools/test_libs.sh
+++ b/dev_tools/test_libs.sh
@@ -33,7 +33,7 @@ if [[ "$1" == "-h" || "$1" == "--help" || "$1" == "help" ]]; then
 fi
 
 if ! python -m pip show -qq py-cpuinfo 2>/dev/null; then
-    echo "Error: missing 'py-cpuinfo'. Please install dev-requirements.txt." >&2
+    echo "Error: missing package 'py-cpuinfo'." >&2
     exit 1
 fi
 

From 2f9320f53588c0107613c39f044b6c75e239394f Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Sun, 18 Jan 2026 06:49:53 +0000
Subject: [PATCH 30/33] Adjust `Dockerfile`s for changes in requirements scheme

Move the installation of the dev dependencies to the top-level
Dockerfile because things fail to build otherwise.
---
 Dockerfile                  | 2 +-
 pybind_interface/Dockerfile | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 100c68e92..1e6ae80a9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -52,7 +52,7 @@ ENV PATH="/test_env/bin:$PATH"
 # Install qsim requirements.
 # hadolint ignore=DL3013
 RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir -r requirements.txt
+    python3 -m pip install --no-cache-dir -r requirements.txt --group dev
 
 # Compile qsim.
 RUN make -j qsim
diff --git a/pybind_interface/Dockerfile b/pybind_interface/Dockerfile
index 2b1cb4fd0..54a4ff511 100644
--- a/pybind_interface/Dockerfile
+++ b/pybind_interface/Dockerfile
@@ -26,9 +26,5 @@ WORKDIR /qsim/
 # Build pybind code early to cache the results
 RUN make -j -C /qsim/ pybind
 
-# Install Python development dependencies.
-# hadolint ignore=DL3013
-RUN pip install --no-cache-dir --group dev
-
 # Compile and run qsim tests
 ENTRYPOINT ["make", "-C", "/qsim/", "run-py-tests"]

From e2682a97a7a02b21379378f1faaa330907273621 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Sun, 18 Jan 2026 07:02:54 +0000
Subject: [PATCH 31/33] Don't need to update pip after venv creation

As noted by Pavol in
https://github.com/quantumlib/qsim/pull/985/changes#r2678037600, it's
not necessary to update pip when `--upgrade-deps` is used in the venv creation.
---
 Dockerfile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 1e6ae80a9..8a3642cb9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -50,9 +50,7 @@ WORKDIR /qsim/
 ENV PATH="/test_env/bin:$PATH"
 
 # Install qsim requirements.
-# hadolint ignore=DL3013
-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir -r requirements.txt --group dev
+RUN python3 -m pip install --no-cache-dir -r requirements.txt --group dev
 
 # Compile qsim.
 RUN make -j qsim

From 996f6f914e17c7e484cb08d53aa62937528326c4 Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Sun, 18 Jan 2026 07:12:37 +0000
Subject: [PATCH 32/33] Remove unnecessary venv activation

As pointed out by Pavol in review comments, the venv activation command
was pointless.
---
 Dockerfile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8a3642cb9..366ffbee1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,8 +41,7 @@ COPY ./requirements.txt /qsim/requirements.txt
 COPY ./pyproject.toml /qsim/pyproject.toml
 
 # Create venv to avoid collision between system packages and what we install.
-RUN python3 -m venv --upgrade-deps test_env && \
-    . test_env/bin/activate
+RUN python3 -m venv --upgrade-deps test_env
 
 WORKDIR /qsim/
 

From 74e6c5813626ea23f5515927dbbaa90f4e4654cb Mon Sep 17 00:00:00 2001
From: mhucka <mhucka@google.com>
Date: Sun, 18 Jan 2026 07:14:40 +0000
Subject: [PATCH 33/33] Rearrange order of commands in Dockerfile

It makes more sense to set the workdir before creating the venv.
---
 Dockerfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 366ffbee1..87155d2f6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -40,13 +40,13 @@ COPY ./qsimcirq_tests/ /qsim/qsimcirq_tests/
 COPY ./requirements.txt /qsim/requirements.txt
 COPY ./pyproject.toml /qsim/pyproject.toml
 
+WORKDIR /qsim/
+
 # Create venv to avoid collision between system packages and what we install.
 RUN python3 -m venv --upgrade-deps test_env
 
-WORKDIR /qsim/
-
 # Activate venv.
-ENV PATH="/test_env/bin:$PATH"
+ENV PATH="/qsim/test_env/bin:$PATH"
 
 # Install qsim requirements.
 RUN python3 -m pip install --no-cache-dir -r requirements.txt --group dev