From 3b840e5e73c309f77734b093ebc8067901076c21 Mon Sep 17 00:00:00 2001
From: Alex Hocks
Date: Fri, 2 Jan 2026 11:47:52 +0100
Subject: [PATCH 01/14] support large strain

---
 pyfans/micro.cpp | 72 +++++++++++++++++++++++++++++++-----------------
 pyfans/micro.hpp |  7 +++--
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp
index b9f3636..bf7d4a8 100644
--- a/pyfans/micro.cpp
+++ b/pyfans/micro.cpp
@@ -31,14 +31,21 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file)
     // Input file name is hardcoded. TODO: Make it configurable
     reader.ReadInputFile(input_file);
-    reader.ReadMS(3);
-    matmanager = createMaterialManager<3, 6>(reader);
-    solver     = createSolver<3, 6>(reader, matmanager);
+
+    if (reader.strain_type == "small") {
+        matmanager = createMaterialManager<3, 6>(reader);
+        solver     = createSolver<3, 6>(reader, std::get<MaterialManager<3, 6> *>(matmanager));
+    }
+    else {
+        matmanager = createMaterialManager<3, 9>(reader);
+        solver     = createSolver<3, 9>(reader, std::get<MaterialManager<3, 9> *>(matmanager));
+    }
 }
 
 py::dict MicroSimulation::solve(py::dict macro_data, double dt)
 {
+    const bool is_small_strain = std::holds_alternative<MaterialManager<3, 6> *>(matmanager);
     // Time step value dt is not used currently, but is available for future use
 
     // Create a pybind style Numpy array from macro_write_data["micro_vector_data"], which is a Numpy array
     py::array_t<double> strain1 = macro_data["strains1to3"].cast<py::array_t<double>>();
     py::array_t<double> strain2 = macro_data["strains4to6"].cast<py::array_t<double>>();
     py::array_t<double> strain  = merge_arrays(strain1, strain2);
+    if (not is_small_strain) {
+        py::array_t<double> strain3 = macro_data["strains7to9"].cast<py::array_t<double>>();
+        strain                      = merge_arrays(strain, strain3);
+    }
 
     std::vector<double> g0 = std::vector<double>(strain.data(), strain.data() + strain.size()); // convert numpy array to std::vector.
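
A quick illustration (not part of the patch): merge_arrays() is used above to
concatenate the 3-component blocks delivered by the Micro Manager into one
strain vector, 6 entries in the small-strain case and 9 in the large-strain
case. Its body is only visible later in this series; a minimal implementation
consistent with this call site could look like the following sketch (assumed,
illustrative only):

    #include <algorithm>
    #include "pybind11/numpy.h"
    namespace py = pybind11;

    py::array_t<double> merge_arrays(py::array_t<double> array1, py::array_t<double> array2)
    {
        // Allocate a flat array that holds both inputs back to back.
        py::array_t<double> merged(array1.size() + array2.size());
        double             *dst = merged.mutable_data();
        std::copy(array1.data(), array1.data() + array1.size(), dst);
        std::copy(array2.data(), array2.data() + array2.size(), dst + array1.size());
        return merged;
    }
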
 
     VectorXd homogenized_stress;
 
-    matmanager->set_gradient(g0);
+    std::visit([&](auto& mm){mm->set_gradient(g0);}, matmanager);
 
-    solver->solve();
+    std::visit([](auto& s){s->solve();}, solver);
 
-    homogenized_stress = solver->get_homogenized_stress();
+    homogenized_stress = std::visit([](auto& s) -> VectorXd { return s->get_homogenized_stress();}, solver);
 
-    auto C = solver->get_homogenized_tangent(pert_param);
+    auto C = std::visit([&](auto& s) -> MatrixXd {return s->get_homogenized_tangent(pert_param);}, solver);
 
     // Convert data to a py::dict again to send it back to the Micro Manager
     py::dict micro_write_data;
 
     // Add stress and stiffness matrix data to Python dict to be returned
-    std::vector<double> stress13 = {homogenized_stress[0], homogenized_stress[1], homogenized_stress[2]};
-    micro_write_data["stresses1to3"] = stress13;
-    std::vector<double> stress46 = {homogenized_stress[3], homogenized_stress[4], homogenized_stress[5]};
-    micro_write_data["stresses4to6"] = stress46;
-    std::vector<double> C_1 = {C(0, 0), C(0, 1), C(0, 2)};
-    micro_write_data["cmat1"] = C_1;
-    std::vector<double> C_2 = {C(0, 3), C(0, 4), C(0, 5)};
-    micro_write_data["cmat2"] = C_2;
-    std::vector<double> C_3 = {C(1, 1), C(1, 2), C(1, 3)};
-    micro_write_data["cmat3"] = C_3;
-    std::vector<double> C_4 = {C(1, 4), C(1, 5), C(2, 2)};
-    micro_write_data["cmat4"] = C_4;
-    std::vector<double> C_5 = {C(2, 3), C(2, 4), C(2, 5)};
-    micro_write_data["cmat5"] = C_5;
-    std::vector<double> C_6 = {C(3, 3), C(3, 4), C(3, 5)};
-    micro_write_data["cmat6"] = C_6;
-    std::vector<double> C_7 = {C(4, 4), C(4, 5), C(5, 5)};
-    micro_write_data["cmat7"] = C_7;
+    if (is_small_strain) {
+        std::vector<double> stress13 = {homogenized_stress[0], homogenized_stress[1], homogenized_stress[2]};
+        micro_write_data["stresses1to3"] = stress13;
+        std::vector<double> stress46 = {homogenized_stress[3], homogenized_stress[4], homogenized_stress[5]};
+        micro_write_data["stresses4to6"] = stress46;
+        std::vector<double> C_1 = {C(0, 0), C(0, 1), C(0, 2)};
+        micro_write_data["cmat1"] = C_1;
+        std::vector<double> C_2 = {C(0, 3), C(0, 4), C(0, 5)};
+        micro_write_data["cmat2"] = C_2;
+        std::vector<double> C_3 = {C(1, 1), C(1, 2), C(1, 3)};
+        micro_write_data["cmat3"] = C_3;
+        std::vector<double> C_4 = {C(1, 4), C(1, 5), C(2, 2)};
+        micro_write_data["cmat4"] = C_4;
+        std::vector<double> C_5 = {C(2, 3), C(2, 4), C(2, 5)};
+        micro_write_data["cmat5"] = C_5;
+        std::vector<double> C_6 = {C(3, 3), C(3, 4), C(3, 5)};
+        micro_write_data["cmat6"] = C_6;
+        std::vector<double> C_7 = {C(4, 4), C(4, 5), C(5, 5)};
+        micro_write_data["cmat7"] = C_7;
+    }
+    else {
+        std::vector<double> stress13 = {homogenized_stress[0], homogenized_stress[1], homogenized_stress[2]};
+        micro_write_data["stresses1to3"] = stress13;
+        std::vector<double> stress46 = {homogenized_stress[3], homogenized_stress[4], homogenized_stress[5]};
+        micro_write_data["stresses4to6"] = stress46;
+        std::vector<double> stress79 = {homogenized_stress[6], homogenized_stress[7], homogenized_stress[8]};
+        micro_write_data["stresses7to9"] = stress79;
+    }
 
     return micro_write_data;
 }
diff --git a/pyfans/micro.hpp b/pyfans/micro.hpp
index b6ebc6c..7eafc2a 100644
--- a/pyfans/micro.hpp
+++ b/pyfans/micro.hpp
@@ -5,6 +5,7 @@
 #pragma once
 #include <iostream>
 #include <vector>
+#include <variant>
 
 #include "pybind11/pybind11.h"
 #include "pybind11/numpy.h" // numpy arrays
@@ -26,7 +27,9 @@ class MicroSimulation {
     int    _sim_id;
     Reader reader;
     // Hardcoding mechanical models because these definitions need information from the input file.
-    MaterialManager<3, 6> *matmanager;
-    Solver<3, 6> *solver;
+    using matmanager_t = std::variant<MaterialManager<3, 6>*, MaterialManager<3, 9>*>;
+    using solver_t = std::variant<Solver<3, 6>*, Solver<3, 9>*>;
+    matmanager_t matmanager;
+    solver_t solver;
     double pert_param = 1e-6; // scalar strain perturbation parameter
 };

From c77b6d575efaddede4f951e49c7474f6c10d1846 Mon Sep 17 00:00:00 2001
From: Alex Hocks
Date: Fri, 2 Jan 2026 11:52:07 +0100
Subject: [PATCH 02/14] add changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54877af..414e660 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## v0.6.1
 
+- Add pyFANS support for large strain [#124](https://github.com/DataAnalyticsEngineering/FANS/pull/124)
 - Adds support to write integration point data to disk [#117](https://github.com/DataAnalyticsEngineering/FANS/pull/117)
 
 ## v0.6.0

From dd8690c48456ac08b754161aa5928d958be29f8a Mon Sep 17 00:00:00 2001
From: Alex Hocks
Date: Fri, 2 Jan 2026 12:26:34 +0100
Subject: [PATCH 03/14] fix lock file

---
 pixi.lock | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/pixi.lock b/pixi.lock
index a1b4345..86abe46 100644
--- a/pixi.lock
+++ b/pixi.lock
@@ -3,8 +3,6 @@ environments:
   dashboard:
     channels:
     - url: https://conda.anaconda.org/conda-forge/
-    options:
-      pypi-prerelease-mode: if-necessary-or-explicit
     packages:
       linux-64:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
@@ -1456,8 +1454,6 @@ environments:
   default:
     channels:
     - url: https://conda.anaconda.org/conda-forge/
-    options:
-      pypi-prerelease-mode: if-necessary-or-explicit
     packages:
       linux-64:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
@@ -2951,8 +2947,6 @@ environments:
   dev:
     channels:
     - url: https://conda.anaconda.org/conda-forge/
-    options:
-      pypi-prerelease-mode: if-necessary-or-explicit
     packages:
       linux-64:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
@@ -6127,9 +6121,6 @@ packages:
   version: 0.6.1
   build: h1235946_0
   subdir: linux-64
-  variants:
-    cxx_compiler: clangxx
-    target_platform: linux-64
   depends:
   - libstdcxx >=15
   - libgcc >=15
   - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_*
   - fftw >=3.3.10,<4.0a0
   - fftw * mpi_openmpi_*
   input:
-    hash: 16dd9c1131985cbcdb3bb37c6672fa20ea368188987b8d42da1b2ebd02bd9fbf
+    hash: 81734781fed742a3ee906fd6767759243525440b3f419cd0622b90e4d0f107fe
     globs: []
 - conda: .
   name: fans
@@ -6171,16 +6162,13 @@ packages:
   version: 0.6.1
   build: hcb8d3e5_0
   subdir: osx-arm64
-  variants:
-    cxx_compiler: clangxx
-    target_platform: osx-arm64
   depends:
   - libcxx >=21
   - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_*
   - fftw >=3.3.10,<4.0a0
   - fftw * mpi_openmpi_*
   input:
-    hash: 16dd9c1131985cbcdb3bb37c6672fa20ea368188987b8d42da1b2ebd02bd9fbf
+    hash: 81734781fed742a3ee906fd6767759243525440b3f419cd0622b90e4d0f107fe
     globs: []
 - conda: https://conda.anaconda.org/conda-forge/linux-64/fftw-3.3.10-mpi_openmpi_h99e62ba_10.conda
   sha256: 59a1fd0daa71bd5529e19b4da8aae2ded4d4ef05e5e31d80c39cbe2fc7664b6a

From b183777b8b90bf0b47de89036617718947f50cfd Mon Sep 17 00:00:00 2001
From: Alex Hocks
Date: Fri, 2 Jan 2026 17:11:30 +0100
Subject: [PATCH 04/14] fix pyFANS MPI

---
 include/reader.h   | 103 ++++++++++++-----------------
 include/solver.h   | 158 +++++++++++----------------------------------
 include/solverCG.h |   2 +-
 pyfans/micro.cpp   |  10 +++
 src/reader.cpp     |  32 +++++----
 5 files changed, 109 insertions(+), 196 deletions(-)

diff --git a/include/reader.h b/include/reader.h
index 1e62249..be947ae 100644
--- a/include/reader.h
+++ b/include/reader.h
@@ -21,6 +21,10 @@ class Reader {
     // Destructor to free allocated memory
     ~Reader();
 
+    // MPI controls
+    bool force_single_rank;
+    MPI_Comm communicator;
+
     // contents of input file:
     char ms_filename[4096]{};    // Name of Micro-structure hdf5 file
     char ms_datasetname[4096]{}; // Absolute path of Micro-structure in hdf5 file
@@ -73,10 +77,10 @@ class Reader {
     void writeData(const char *fieldName, int load_idx, int time_idx, T *data, hsize_t *dims, int ndims);
 
     template <typename T>
-    void writeSlab(const char *fieldName, int load_idx, int time_idx, T *data, const std::vector<int> &extra_dims);
+    void writeSlab(const char *fieldName, int load_idx, int time_idx, T *data, int size);
 
     template <typename T>
-    void WriteSlab(T *data, const std::vector<int> &extra_dims, const char *dset_name);
+    void WriteSlab(T *data, int _howmany, const char *dset_name);
 
     template <typename T>
     void WriteData(T *data, const char *dset_name, hsize_t *dims, int rank);
@@ -157,25 +161,22 @@ void Reader::WriteData(T *data, const char *dset_name, hsize_t *dims, int rank)
 
 // this function has to be here because of the template
 /* ---------------------------------------------------------------------------
- * Write an N-D slab (local_n0 × Ny × Nz × d1 × d2 × ...) from THIS MPI rank
- * into an HDF5 dataset whose global layout on disk is
+ * Write a 4-D slab (local_n0 × Ny × Nz × howmany) from THIS MPI rank into an
+ * HDF5 dataset whose global layout on disk is
  *
- *      Z Y X d1 d2 ...      (i.e. "zyx" + extra dims)
+ *      Z Y X k              (i.e. "zyx" + extra dim k)
  *
  * The caller supplies the data in logical order
- *      X Y Z d1 d2 ... .
+ *      X Y Z k .
  *
- * We therefore transpose the first 3 dimensions once per write call.
+ * We therefore transpose in-memory once per write call.
  * --------------------------------------------------------------------------*/
 template <typename T>
 void Reader::WriteSlab(
-    T *data,                            // in: local slab, layout [X][Y][Z][d1][d2]...
-    const std::vector<int> &extra_dims, // sizes of extra dimensions [d1, d2, ...]
-    const char *dset_name)
+    T *data,       // in: local slab, layout [X][Y][Z][k]
+    int _howmany,  // global size of the 4th axis (k)
+    const char *dset_name)
 {
-    if (extra_dims.empty()) {
-        throw std::invalid_argument("WriteSlab: extra_dims cannot be empty");
-    }
     /*------------------------------------------------------------------*/
     /* 0. map C++ type -> native HDF5 type                               */
     /*------------------------------------------------------------------*/
@@ -197,6 +198,7 @@ void Reader::WriteSlab(
     /* 1. open or create the HDF5 file                                   */
     /*------------------------------------------------------------------*/
     hid_t file_id = results_file_id;
+    hid_t plist_id;
     if (file_id < 0) {
         throw std::runtime_error("WriteSlab: results file is not open");
     }
     void *old_client_data;
 
     /*------------------------------------------------------------------*/
-    /* 2. create the dataset (global dims = Z Y X d1 d2 ...) if needed   */
+    /* 2. create the dataset (global dims =  Z Y X k ) if necessary      */
     /*------------------------------------------------------------------*/
-    const hsize_t Nx = static_cast<hsize_t>(dims[0]);
-    const hsize_t Ny = static_cast<hsize_t>(dims[1]);
-    const hsize_t Nz = static_cast<hsize_t>(dims[2]);
-
-    const int rank = 3 + extra_dims.size();
-    std::vector<hsize_t> dimsf(rank);
-    dimsf[0] = Nz;
-    dimsf[1] = Ny;
-    dimsf[2] = Nx;
-    for (size_t i = 0; i < extra_dims.size(); ++i) {
-        dimsf[3 + i] = static_cast<hsize_t>(extra_dims[i]);
-    }
+    const hsize_t Nx   = static_cast<hsize_t>(dims[0]);
+    const hsize_t Ny   = static_cast<hsize_t>(dims[1]);
+    const hsize_t Nz   = static_cast<hsize_t>(dims[2]);
+    const hsize_t kDim = static_cast<hsize_t>(_howmany);
+
+    const int rank        = 4;
+    hsize_t   dimsf[rank] = {Nz, Ny, Nx, kDim}; /* <<<  Z Y X k  */
 
     safe_create_group(file_id, dset_name);
 
     H5Eset_auto(H5E_DEFAULT, old_func, old_client_data); /* restore */
 
     if (dset_id < 0) {
-        hid_t filespace = H5Screate_simple(rank, dimsf.data(), nullptr);
+        hid_t filespace = H5Screate_simple(rank, dimsf, nullptr);
         dset_id = H5Dcreate2(file_id, dset_name, data_type, filespace,
                              H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
         H5Sclose(filespace);
     }
 
     /*------------------------------------------------------------------*/
     /* 3. build FILE hyperslab (slice along file-dim 2 = X axis)         */
     /*------------------------------------------------------------------*/
-    std::vector<hsize_t> fcount(rank);
-    std::vector<hsize_t> foffset(rank);
-
-    fcount[0] = Nz;
-    fcount[1] = Ny;
-    fcount[2] = static_cast<hsize_t>(local_n0);
-    for (size_t i = 0; i < extra_dims.size(); ++i) {
-        fcount[3 + i] = static_cast<hsize_t>(extra_dims[i]);
-    }
-
-    foffset[0] = 0;
-    foffset[1] = 0;
-    foffset[2] = static_cast<hsize_t>(local_0_start);
-    for (size_t i = 0; i < extra_dims.size(); ++i) {
-        foffset[3 + i] = 0;
-    }
+    hsize_t fcount[4]  = {Nz, Ny, static_cast<hsize_t>(local_n0), kDim};
+    hsize_t foffset[4] = {0, 0, static_cast<hsize_t>(local_0_start), 0};
 
     hid_t filespace = H5Dget_space(dset_id);
-    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, foffset.data(), nullptr, fcount.data(), nullptr);
+    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, foffset, nullptr, fcount, nullptr);
 
     /*------------------------------------------------------------------*/
     /* 4.
transpose local slab [X][Y][Z][k] -> [Z][Y][X][k] */ /*------------------------------------------------------------------*/ - const Eigen::Index NxLoc = static_cast(local_n0); - const Eigen::Index NyLoc = static_cast(Ny); - const Eigen::Index NzLoc = static_cast(Nz); - - // Flatten extra dimensions into one for Eigen Tensor (which supports up to rank 4) - size_t extra_size = 1; - for (int d : extra_dims) - extra_size *= d; - const Eigen::Index extraLoc = static_cast(extra_size); - - const size_t slabElems = static_cast(NxLoc * NyLoc * NzLoc * extraLoc); + const Eigen::Index NxLoc = static_cast(local_n0); + const Eigen::Index NyLoc = static_cast(Ny); + const Eigen::Index NzLoc = static_cast(Nz); + const Eigen::Index kLoc = static_cast(kDim); + const size_t slabElems = static_cast(NxLoc * NyLoc * NzLoc * kLoc); T *tmp = static_cast(fftw_malloc(slabElems * sizeof(T))); if (!tmp) { @@ -293,21 +270,21 @@ void Reader::WriteSlab( } Eigen::TensorMap> - input_tensor(data, NxLoc, NyLoc, NzLoc, extraLoc); + input_tensor(data, NxLoc, NyLoc, NzLoc, kLoc); Eigen::TensorMap> - output_tensor(tmp, NzLoc, NyLoc, NxLoc, extraLoc); + output_tensor(tmp, NzLoc, NyLoc, NxLoc, kLoc); - // Permute dimensions: (0,1,2,3) -> (2,1,0,3) swaps X and Z, keeps extra dims + // Permute dimensions: (0,1,2,3) -> (2,1,0,3) swaps X and Z Eigen::array shuffle_dims = {2, 1, 0, 3}; output_tensor = input_tensor.shuffle(shuffle_dims); /*------------------------------------------------------------------*/ /* 5. MEMORY dataspace matches FILE slab exactly */ /*------------------------------------------------------------------*/ - hid_t memspace = H5Screate_simple(rank, fcount.data(), nullptr); + hid_t memspace = H5Screate_simple(4, fcount, nullptr); - hid_t plist_id = H5Pcreate(H5P_DATASET_XFER); + plist_id = H5Pcreate(H5P_DATASET_XFER); H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); herr_t status = H5Dwrite(dset_id, data_type, @@ -337,14 +314,14 @@ void Reader::writeData(const char *fieldName, int load_idx, int time_idx, T *dat } template -void Reader::writeSlab(const char *fieldName, int load_idx, int time_idx, T *data, const std::vector &extra_dims) +void Reader::writeSlab(const char *fieldName, int load_idx, int time_idx, T *data, int size) { if (std::find(resultsToWrite.begin(), resultsToWrite.end(), fieldName) == resultsToWrite.end()) { return; } char name[5096]; snprintf(name, sizeof(name), "%s/load%i/time_step%i/%s", dataset_name, load_idx, time_idx, fieldName); - WriteSlab(data, extra_dims, name); + WriteSlab(data, size, name); } #endif diff --git a/include/solver.h b/include/solver.h index cb05d49..e14e33f 100644 --- a/include/solver.h +++ b/include/solver.h @@ -18,6 +18,7 @@ class Solver : private MixedBCController { const int world_rank; const int world_size; + MPI_Comm communicator; const ptrdiff_t n_x, n_y, n_z; // NOTE: the order in the declaration is very important because it is the same order in which the later initialization via member initializer lists takes place @@ -108,6 +109,7 @@ Solver::Solver(Reader &reader, MaterialManager * matmanager(matmgr), world_rank(reader.world_rank), world_size(reader.world_size), + communicator(reader.communicator), n_x(reader.dims[0]), n_y(reader.dims[1]), n_z(reader.dims[2]), @@ -218,8 +220,8 @@ void Solver::CreateFFTWPlans(double *in, fftw_complex *transform // But, according to https://fftw.org/doc/MPI-Plan-Creation.html the BLOCK sizes must be the same: // "These must be the same block sizes as were passed to the corresponding ‘local_size’ function" const ptrdiff_t n[3] = 
{n_x, n_y, n_z}; - planfft = fftw_mpi_plan_many_dft_r2c(rank, n, howmany, iblock, oblock, in, transformed, MPI_COMM_WORLD, FFTW_MEASURE | FFTW_MPI_TRANSPOSED_OUT); - planifft = fftw_mpi_plan_many_dft_c2r(rank, n, howmany, iblock, oblock, transformed, out, MPI_COMM_WORLD, FFTW_MEASURE | FFTW_MPI_TRANSPOSED_IN); + planfft = fftw_mpi_plan_many_dft_r2c(rank, n, howmany, iblock, oblock, in, transformed, communicator, FFTW_MEASURE | FFTW_MPI_TRANSPOSED_OUT); + planifft = fftw_mpi_plan_many_dft_c2r(rank, n, howmany, iblock, oblock, transformed, out, communicator, FFTW_MEASURE | FFTW_MPI_TRANSPOSED_IN); // see https://eigen.tuxfamily.org/dox/group__TutorialMapClass.html#title3 new (&rhat) Map((std::complex *) transformed, local_n1 * n_x * (n_z / 2 + 1) * howmany); @@ -242,7 +244,7 @@ void Solver::compute_residual_basic(RealArray &r_matrix, RealArr // int MPI_Sendrecv(void *sendbuf, int sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, // int recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status) MPI_Sendrecv(u, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, - u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, communicator, MPI_STATUS_IGNORE); Matrix ue; @@ -262,7 +264,7 @@ void Solver::compute_residual_basic(RealArray &r_matrix, RealArr }); MPI_Sendrecv(r + local_n0 * n_y * (n_z + padding) * howmany, n_y * (n_z + padding) * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, - buffer_padding, n_y * (n_z + padding) * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + buffer_padding, n_y * (n_z + padding) * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, communicator, MPI_STATUS_IGNORE); RealArray b(buffer_padding, n_z * howmany, n_y, OuterStride<>((n_z + padding) * howmany)); // NOTE: for any padding of more than 2, the buffer_padding has to be extended @@ -427,7 +429,7 @@ double Solver::compute_error(RealArray &r) } double err; - MPI_Allreduce(&err_local, &err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(&err_local, &err, 1, MPI_DOUBLE, MPI_MAX, communicator); err_all[iter] = err; double err0 = err_all[0]; @@ -454,37 +456,8 @@ double Solver::compute_error(RealArray &r) template void Solver::postprocess(Reader &reader, int load_idx, int time_idx) { - int n_gp = matmanager->models[0]->n_gp; - - // Check what user requested - auto &results = reader.resultsToWrite; - bool need_stress = std::find(results.begin(), results.end(), "stress") != results.end(); - bool need_stress_gp = std::find(results.begin(), results.end(), "stress_gp") != results.end(); - bool need_strain = std::find(results.begin(), results.end(), "strain") != results.end(); - bool need_strain_gp = std::find(results.begin(), results.end(), "strain_gp") != results.end(); - - bool need_global_avg = std::find(results.begin(), results.end(), "stress_average") != results.end() || - std::find(results.begin(), results.end(), "strain_average") != results.end(); - - bool need_phase_avg = false; - for (int mat_idx = 0; mat_idx < reader.n_mat; ++mat_idx) { - char name[512]; - sprintf(name, "phase_stress_average_phase%d", mat_idx); - if (std::find(results.begin(), results.end(), name) != results.end()) { - need_phase_avg = true; - break; - } - } - - // Determine if we need to 
compute stress/strain at all - bool need_compute = need_stress || need_stress_gp || need_strain || need_strain_gp || need_global_avg || need_phase_avg; - - // Conditional allocation - VectorXd *strain_elem = need_strain ? new VectorXd(local_n0 * n_y * n_z * n_str) : nullptr; - VectorXd *stress_elem = need_stress ? new VectorXd(local_n0 * n_y * n_z * n_str) : nullptr; - VectorXd *strain_gp = need_strain_gp ? new VectorXd(local_n0 * n_y * n_z * n_gp * n_str) : nullptr; - VectorXd *stress_gp = need_stress_gp ? new VectorXd(local_n0 * n_y * n_z * n_gp * n_str) : nullptr; - + VectorXd strain = VectorXd::Zero(local_n0 * n_y * n_z * n_str); + VectorXd stress = VectorXd::Zero(local_n0 * n_y * n_z * n_str); VectorXd stress_average = VectorXd::Zero(n_str); VectorXd strain_average = VectorXd::Zero(n_str); @@ -495,76 +468,38 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ vector phase_counts(n_mat, 0); MPI_Sendrecv(v_u, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, - v_u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + v_u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, communicator, MPI_STATUS_IGNORE); Matrix ue; int phase_id; - - if (need_compute) { - iterateCubes<0>([&](ptrdiff_t *idx, ptrdiff_t *idxPadding) { - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < howmany; ++j) { - ue(howmany * i + j, 0) = v_u[howmany * idx[i] + j]; - } - } - phase_id = ms[idx[0]]; - - const MaterialInfo &info = matmanager->get_info(phase_id); - - // Temporary storage for element averages - double elem_strain_avg[n_str]; - double elem_stress_avg[n_str]; - - // Compute once - populates internal eps/sigma - info.model->getStrainStress(elem_strain_avg, elem_stress_avg, ue, info.local_mat_id, idx[0]); - - // Store element averages if requested - if (need_stress) { - for (int c = 0; c < n_str; ++c) { - (*stress_elem)(idx[0] * n_str + c) = elem_stress_avg[c]; - } - } - if (need_strain) { - for (int c = 0; c < n_str; ++c) { - (*strain_elem)(idx[0] * n_str + c) = elem_strain_avg[c]; - } + iterateCubes<0>([&](ptrdiff_t *idx, ptrdiff_t *idxPadding) { + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < howmany; ++j) { + ue(howmany * i + j, 0) = v_u[howmany * idx[i] + j]; } + } + phase_id = ms[idx[0]]; - // Copy all GP data if requested - if (need_stress_gp) { - memcpy(&(*stress_gp)(idx[0] * n_gp * n_str), - info.model->get_sigma_data(), - n_gp * n_str * sizeof(double)); - } - if (need_strain_gp) { - memcpy(&(*strain_gp)(idx[0] * n_gp * n_str), - info.model->get_eps_data(), - n_gp * n_str * sizeof(double)); - } + const MaterialInfo &info = matmanager->get_info(phase_id); + info.model->getStrainStress(strain.segment(n_str * idx[0], n_str).data(), stress.segment(n_str * idx[0], n_str).data(), ue, info.local_mat_id, idx[0]); + stress_average += stress.segment(n_str * idx[0], n_str); + strain_average += strain.segment(n_str * idx[0], n_str); - // Accumulate for global and phase averages - if (need_global_avg || need_phase_avg) { - for (int c = 0; c < n_str; ++c) { - stress_average(c) += elem_stress_avg[c]; - strain_average(c) += elem_strain_avg[c]; - phase_stress_average[phase_id](c) += elem_stress_avg[c]; - phase_strain_average[phase_id](c) += elem_strain_avg[c]; - } - phase_counts[phase_id]++; - } - }); - } + phase_stress_average[phase_id] += stress.segment(n_str * idx[0], n_str); + phase_strain_average[phase_id] += 
strain.segment(n_str * idx[0], n_str); + phase_counts[phase_id]++; + }); - MPI_Allreduce(MPI_IN_PLACE, stress_average.data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, strain_average.data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, stress_average.data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); + MPI_Allreduce(MPI_IN_PLACE, strain_average.data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); stress_average /= (n_x * n_y * n_z); strain_average /= (n_x * n_y * n_z); // Reduce per-phase accumulations across all processes for (int mat_index = 0; mat_index < n_mat; ++mat_index) { - MPI_Allreduce(MPI_IN_PLACE, phase_stress_average[mat_index].data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, phase_strain_average[mat_index].data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, &phase_counts[mat_index], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, phase_stress_average[mat_index].data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); + MPI_Allreduce(MPI_IN_PLACE, phase_strain_average[mat_index].data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); + MPI_Allreduce(MPI_IN_PLACE, &phase_counts[mat_index], 1, MPI_INT, MPI_SUM, communicator); // Compute average for each phase if (phase_counts[mat_index] > 0) { @@ -664,33 +599,16 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ reader.writeData("absolute_error", load_idx, time_idx, err_all.data(), dims, 1); vector rank_field(local_n0 * n_y * n_z, world_rank); - reader.writeSlab("mpi_rank", load_idx, time_idx, rank_field.data(), {1}); - reader.writeSlab("microstructure", load_idx, time_idx, ms, {1}); - reader.writeSlab("displacement_fluctuation", load_idx, time_idx, v_u, {howmany}); - reader.writeSlab("displacement", load_idx, time_idx, u_total.data(), {howmany}); - reader.writeSlab("residual", load_idx, time_idx, v_r, {howmany}); - - if (need_strain) - reader.writeSlab("strain", load_idx, time_idx, strain_elem->data(), {n_str}); - if (need_stress) - reader.writeSlab("stress", load_idx, time_idx, stress_elem->data(), {n_str}); - if (need_strain_gp) - reader.writeSlab("strain_gp", load_idx, time_idx, strain_gp->data(), {n_gp, n_str}); - if (need_stress_gp) - reader.writeSlab("stress_gp", load_idx, time_idx, stress_gp->data(), {n_gp, n_str}); + reader.writeSlab("mpi_rank", load_idx, time_idx, rank_field.data(), 1); + reader.writeSlab("microstructure", load_idx, time_idx, ms, 1); + reader.writeSlab("displacement_fluctuation", load_idx, time_idx, v_u, howmany); + reader.writeSlab("displacement", load_idx, time_idx, u_total.data(), howmany); + reader.writeSlab("residual", load_idx, time_idx, v_r, howmany); + reader.writeSlab("strain", load_idx, time_idx, strain.data(), n_str); + reader.writeSlab("stress", load_idx, time_idx, stress.data(), n_str); matmanager->postprocess(*this, reader, load_idx, time_idx); - // Cleanup - if (strain_elem) - delete strain_elem; - if (stress_elem) - delete stress_elem; - if (strain_gp) - delete strain_gp; - if (stress_gp) - delete stress_gp; - // Compute homogenized tangent only if requested if (find(reader.resultsToWrite.begin(), reader.resultsToWrite.end(), "homogenized_tangent") != reader.resultsToWrite.end()) { homogenized_tangent = get_homogenized_tangent(1e-6); @@ -714,7 +632,7 @@ VectorXd Solver::get_homogenized_stress() homogenized_stress = VectorXd::Zero(n_str); MPI_Sendrecv(v_u, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, - 
v_u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + v_u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, communicator, MPI_STATUS_IGNORE); Matrix ue; int phase_id; @@ -731,7 +649,7 @@ VectorXd Solver::get_homogenized_stress() homogenized_stress += stress.segment(n_str * idx[0], n_str); }); - MPI_Allreduce(MPI_IN_PLACE, homogenized_stress.data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, homogenized_stress.data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); homogenized_stress /= (n_x * n_y * n_z); return homogenized_stress; diff --git a/include/solverCG.h b/include/solverCG.h index b02a428..60bc0d6 100644 --- a/include/solverCG.h +++ b/include/solverCG.h @@ -53,7 +53,7 @@ double SolverCG::dotProduct(RealArray &a, RealArray &b) { double local_value = (a * b).sum(); double result; - MPI_Allreduce(&local_value, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&local_value, &result, 1, MPI_DOUBLE, MPI_SUM, this->communicator); return result; } diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp index bf7d4a8..9658119 100644 --- a/pyfans/micro.cpp +++ b/pyfans/micro.cpp @@ -6,6 +6,7 @@ #include "micro.hpp" #include "setup.h" #include "matmodel.h" +#include "mpi.h" py::array_t merge_arrays(py::array_t array1, py::array_t array2) { @@ -31,7 +32,15 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file) // Input file name is hardcoded. TODO: Make it configurable reader.ReadInputFile(input_file); + + reader.force_single_rank = true; + reader.communicator = MPI_COMM_SELF; + + int rank = -1; + MPI_Comm_rank(reader.communicator, &rank); + reader.ReadMS(3); + printf(">>read input%d id %d\n", rank, sim_id); if (reader.strain_type == "small") { matmanager = createMaterialManager<3, 6>(reader); @@ -41,6 +50,7 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file) matmanager = createMaterialManager<3, 9>(reader); solver = createSolver<3, 9>(reader, std::get*>(matmanager)); } + printf(">>constructed %d id %d\n", rank, sim_id); } py::dict MicroSimulation::solve(py::dict macro_data, double dt) diff --git a/src/reader.cpp b/src/reader.cpp index 288d4af..542127e 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -29,8 +29,8 @@ void Reader::ComputeVolumeFractions() // Find the global maximum and minimum material indices unsigned short global_max, global_min; - MPI_Allreduce(&local_max, &global_max, 1, MPI_UNSIGNED_SHORT, MPI_MAX, MPI_COMM_WORLD); - MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_SHORT, MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(&local_max, &global_max, 1, MPI_UNSIGNED_SHORT, MPI_MAX, communicator); + MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_SHORT, MPI_MIN, communicator); // Calculate total number of materials n_mat = global_max - global_min + 1; @@ -52,7 +52,7 @@ void Reader::ComputeVolumeFractions() for (int i = 0; i < n_mat; i++) { long vf; - MPI_Allreduce(&(vol_frac[i]), &vf, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&(vol_frac[i]), &vf, 1, MPI_LONG, MPI_SUM, communicator); v_frac[i] = double(vf) / double(dims[0] * dims[1] * dims[2]); if (world_rank == 0) printf("# material %4u vol. frac. 
%10.4f%% \n", @@ -63,15 +63,23 @@ void Reader::ComputeVolumeFractions() void Reader ::ReadInputFile(char input_fn[]) { try { - - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - MPI_Comm_size(MPI_COMM_WORLD, &world_size); - ifstream i(input_fn); json j; i >> j; inputJson = j; // Store complete input JSON for MaterialManager + if (j.contains("no_mpi")) { + force_single_rank = true; + communicator = MPI_COMM_SELF; + } + else { + force_single_rank = false; + communicator = MPI_COMM_WORLD; + } + + MPI_Comm_rank(communicator, &world_rank); + MPI_Comm_size(communicator, &world_size); + microstructure = j["microstructure"]; strcpy(ms_filename, microstructure["filepath"].get().c_str()); strcpy(ms_datasetname, microstructure["datasetname"].get().c_str()); @@ -222,8 +230,8 @@ void Reader ::ReadMS(int hm) hid_t plist_id; /* property list identifier */ herr_t status; - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(communicator, &world_rank); + MPI_Comm_size(communicator, &world_size); MPI_Info info = MPI_INFO_NULL; // Set up file access property list with parallel I/O access @@ -310,11 +318,11 @@ void Reader ::ReadMS(int hm) ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \ */ - alloc_local = fftw_mpi_local_size_many_transposed(rank, n, hm, block0, block1, MPI_COMM_WORLD, &local_n0, &local_0_start, &local_n1, &local_1_start); + alloc_local = fftw_mpi_local_size_many_transposed(rank, n, hm, block0, block1, communicator, &local_n0, &local_0_start, &local_n1, &local_1_start); if (local_n0 < 4) throw std::runtime_error("[ FANS3D_Grid ] ERROR: Number of voxels in x-direction is less than 4 in process " + to_string(world_rank)); - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(communicator); hsize_t fcount[3], foffset[3]; if (is_zyx) { /* file layout Z Y X */ @@ -400,7 +408,7 @@ void Reader::OpenResultsFile(const char *output_fn) { std::snprintf(results_filename, sizeof(results_filename), "%s", output_fn); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + H5Pset_fapl_mpio(plist_id, communicator, MPI_INFO_NULL); results_file_id = H5Fcreate(results_filename, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); H5Pclose(plist_id); From a4f508e2fba9c235d8a5a947876107baa958daa0 Mon Sep 17 00:00:00 2001 From: Alex Hocks Date: Fri, 2 Jan 2026 17:20:00 +0100 Subject: [PATCH 05/14] applied suggested fix for pixi.lock --- pixi.lock | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/pixi.lock b/pixi.lock index 86abe46..0679bc1 100644 --- a/pixi.lock +++ b/pixi.lock @@ -3,6 +3,8 @@ environments: dashboard: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -1454,6 +1456,8 @@ environments: default: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -2249,7 +2253,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstandard-0.25.0-py314h2e8dab5_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-hbcf94c1_2.conda - conda: . 
- subdir: linux-aarch64 + build: he8cfe8b_0 osx-64: - conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/aiobotocore-2.25.2-pyhcf101f3_0.conda @@ -2596,7 +2600,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/zstandard-0.25.0-py314hd1e8ddb_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda - conda: . - subdir: osx-64 + build: h0dc7051_0 osx-arm64: - conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/aiobotocore-2.25.2-pyhcf101f3_0.conda @@ -2947,6 +2951,8 @@ environments: dev: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -6119,57 +6125,49 @@ packages: - conda: . name: fans version: 0.6.1 - build: h1235946_0 - subdir: linux-64 + build: h0dc7051_0 + subdir: osx-64 + variants: + target_platform: osx-64 depends: - - libstdcxx >=15 - - libgcc >=15 + - libcxx >=21 - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_* - fftw >=3.3.10,<4.0a0 - fftw * mpi_openmpi_* - input: - hash: 81734781fed742a3ee906fd6767759243525440b3f419cd0622b90e4d0f107fe - globs: [] - conda: . name: fans version: 0.6.1 - build: hbf21a9e_0 - subdir: linux-aarch64 + build: h1235946_0 + subdir: linux-64 depends: - libstdcxx >=15 - libgcc >=15 - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_* - fftw >=3.3.10,<4.0a0 - fftw * mpi_openmpi_* - input: - hash: 8ad54598e221efef736cec65db9e327856f556e55f81445cda5620e181ef8813 - globs: [] - conda: . name: fans version: 0.6.1 - build: hbf21a9e_0 - subdir: osx-64 + build: hcb8d3e5_0 + subdir: osx-arm64 depends: - libcxx >=21 - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_* - fftw >=3.3.10,<4.0a0 - fftw * mpi_openmpi_* - input: - hash: 8ad54598e221efef736cec65db9e327856f556e55f81445cda5620e181ef8813 - globs: [] - conda: . 
name: fans version: 0.6.1 - build: hcb8d3e5_0 - subdir: osx-arm64 + build: he8cfe8b_0 + subdir: linux-aarch64 + variants: + target_platform: linux-aarch64 depends: - - libcxx >=21 + - libstdcxx >=15 + - libgcc >=15 - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_* - fftw >=3.3.10,<4.0a0 - fftw * mpi_openmpi_* - input: - hash: 81734781fed742a3ee906fd6767759243525440b3f419cd0622b90e4d0f107fe - globs: [] - conda: https://conda.anaconda.org/conda-forge/linux-64/fftw-3.3.10-mpi_openmpi_h99e62ba_10.conda sha256: 59a1fd0daa71bd5529e19b4da8aae2ded4d4ef05e5e31d80c39cbe2fc7664b6a md5: 3fcbf2ef5f3a8f0ed5683f60fe08293b From d179d7b0362f429619c53768e927bf352866a117 Mon Sep 17 00:00:00 2001 From: Alex Hocks <73783301+Snapex2409@users.noreply.github.com> Date: Fri, 2 Jan 2026 17:23:20 +0100 Subject: [PATCH 06/14] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 414e660..61fb5d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## v0.6.1 -- Add pyFANS support for large strain [#124](https://github.com/DataAnalyticsEngineering/FANS/pull/124) +- Add pyFANS support for large strain and fix pyFANS MPI [#124](https://github.com/DataAnalyticsEngineering/FANS/pull/124) - Adds support to write integration point data to disk [#117](https://github.com/DataAnalyticsEngineering/FANS/pull/117) ## v0.6.0 From 807d347b6726814a2f514c8dce9ab0aca9661064 Mon Sep 17 00:00:00 2001 From: Alex Hocks Date: Fri, 2 Jan 2026 17:56:29 +0100 Subject: [PATCH 07/14] undo merge mistake --- include/reader.h | 99 +++++++++++++++++++++++------------- include/solver.h | 130 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 170 insertions(+), 59 deletions(-) diff --git a/include/reader.h b/include/reader.h index be947ae..8b65c7f 100644 --- a/include/reader.h +++ b/include/reader.h @@ -77,10 +77,10 @@ class Reader { void writeData(const char *fieldName, int load_idx, int time_idx, T *data, hsize_t *dims, int ndims); template - void writeSlab(const char *fieldName, int load_idx, int time_idx, T *data, int size); + void writeSlab(const char *fieldName, int load_idx, int time_idx, T *data, const std::vector &extra_dims); template - void WriteSlab(T *data, int _howmany, const char *dset_name); + void WriteSlab(T *data, const std::vector &extra_dims, const char *dset_name); template void WriteData(T *data, const char *dset_name, hsize_t *dims, int rank); @@ -161,22 +161,25 @@ void Reader::WriteData(T *data, const char *dset_name, hsize_t *dims, int rank) // this function has to be here because of the template /* --------------------------------------------------------------------------- - * Write a 4-D slab (local_n0 × Ny × Nz × howmany) from THIS MPI rank into an - * HDF5 dataset whose global layout on disk is + * Write an N-D slab (local_n0 × Ny × Nz × d1 × d2 × ...) from THIS MPI rank + * into an HDF5 dataset whose global layout on disk is * - * Z Y X k (i.e. "zyx" + extra dim k) + * Z Y X d1 d2 ... (i.e. "zyx" + extra dims) * * The caller supplies the data in logical order - * X Y Z k . + * X Y Z d1 d2 ... . * - * We therefore transpose in-memory once per write call. + * We therefore transpose the first 3 dimensions once per write call. * --------------------------------------------------------------------------*/ template void Reader::WriteSlab( - T *data, // in: local slab, layout [X][Y][Z][k] - int _howmany, // global size of the 4th axis (k) - const char *dset_name) + T *data, // in: local slab, layout [X][Y][Z][d1][d2]... 
+ const std::vector &extra_dims, // sizes of extra dimensions [d1, d2, ...] + const char *dset_name) { + if (extra_dims.empty()) { + throw std::invalid_argument("WriteSlab: extra_dims cannot be empty"); + } /*------------------------------------------------------------------*/ /* 0. map C++ type -> native HDF5 type */ /*------------------------------------------------------------------*/ @@ -198,7 +201,6 @@ void Reader::WriteSlab( /* 1. open or create the HDF5 file */ /*------------------------------------------------------------------*/ hid_t file_id = results_file_id; - hid_t plist_id; if (file_id < 0) { throw std::runtime_error("WriteSlab: results file is not open"); } @@ -206,15 +208,20 @@ void Reader::WriteSlab( void *old_client_data; /*------------------------------------------------------------------*/ - /* 2. create the dataset (global dims = Z Y X k ) if necessary */ + /* 2. create the dataset (global dims = Z Y X d1 d2 ...) if needed */ /*------------------------------------------------------------------*/ - const hsize_t Nx = static_cast(dims[0]); - const hsize_t Ny = static_cast(dims[1]); - const hsize_t Nz = static_cast(dims[2]); - const hsize_t kDim = static_cast(_howmany); - - const int rank = 4; - hsize_t dimsf[rank] = {Nz, Ny, Nx, kDim}; /* <<< Z Y X k */ + const hsize_t Nx = static_cast(dims[0]); + const hsize_t Ny = static_cast(dims[1]); + const hsize_t Nz = static_cast(dims[2]); + + const int rank = 3 + extra_dims.size(); + std::vector dimsf(rank); + dimsf[0] = Nz; + dimsf[1] = Ny; + dimsf[2] = Nx; + for (size_t i = 0; i < extra_dims.size(); ++i) { + dimsf[3 + i] = static_cast(extra_dims[i]); + } safe_create_group(file_id, dset_name); @@ -226,7 +233,7 @@ void Reader::WriteSlab( H5Eset_auto(H5E_DEFAULT, old_func, old_client_data); /* restore */ if (dset_id < 0) { - hid_t filespace = H5Screate_simple(rank, dimsf, nullptr); + hid_t filespace = H5Screate_simple(rank, dimsf.data(), nullptr); dset_id = H5Dcreate2(file_id, dset_name, data_type, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); H5Sclose(filespace); @@ -249,20 +256,40 @@ void Reader::WriteSlab( /*------------------------------------------------------------------*/ /* 3. build FILE hyperslab (slice along file-dim 2 = X axis) */ /*------------------------------------------------------------------*/ - hsize_t fcount[4] = {Nz, Ny, static_cast(local_n0), kDim}; - hsize_t foffset[4] = {0, 0, static_cast(local_0_start), 0}; + std::vector fcount(rank); + std::vector foffset(rank); + + fcount[0] = Nz; + fcount[1] = Ny; + fcount[2] = static_cast(local_n0); + for (size_t i = 0; i < extra_dims.size(); ++i) { + fcount[3 + i] = static_cast(extra_dims[i]); + } + + foffset[0] = 0; + foffset[1] = 0; + foffset[2] = static_cast(local_0_start); + for (size_t i = 0; i < extra_dims.size(); ++i) { + foffset[3 + i] = 0; + } hid_t filespace = H5Dget_space(dset_id); - H5Sselect_hyperslab(filespace, H5S_SELECT_SET, foffset, nullptr, fcount, nullptr); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, foffset.data(), nullptr, fcount.data(), nullptr); /*------------------------------------------------------------------*/ - /* 4. transpose local slab [X][Y][Z][k] -> [Z][Y][X][k] */ + /* 4. transpose local slab [X][Y][Z][d1][d2]... -> [Z][Y][X][d1][d2]... 
*/ /*------------------------------------------------------------------*/ - const Eigen::Index NxLoc = static_cast(local_n0); - const Eigen::Index NyLoc = static_cast(Ny); - const Eigen::Index NzLoc = static_cast(Nz); - const Eigen::Index kLoc = static_cast(kDim); - const size_t slabElems = static_cast(NxLoc * NyLoc * NzLoc * kLoc); + const Eigen::Index NxLoc = static_cast(local_n0); + const Eigen::Index NyLoc = static_cast(Ny); + const Eigen::Index NzLoc = static_cast(Nz); + + // Flatten extra dimensions into one for Eigen Tensor (which supports up to rank 4) + size_t extra_size = 1; + for (int d : extra_dims) + extra_size *= d; + const Eigen::Index extraLoc = static_cast(extra_size); + + const size_t slabElems = static_cast(NxLoc * NyLoc * NzLoc * extraLoc); T *tmp = static_cast(fftw_malloc(slabElems * sizeof(T))); if (!tmp) { @@ -270,21 +297,21 @@ void Reader::WriteSlab( } Eigen::TensorMap> - input_tensor(data, NxLoc, NyLoc, NzLoc, kLoc); + input_tensor(data, NxLoc, NyLoc, NzLoc, extraLoc); Eigen::TensorMap> - output_tensor(tmp, NzLoc, NyLoc, NxLoc, kLoc); + output_tensor(tmp, NzLoc, NyLoc, NxLoc, extraLoc); - // Permute dimensions: (0,1,2,3) -> (2,1,0,3) swaps X and Z + // Permute dimensions: (0,1,2,3) -> (2,1,0,3) swaps X and Z, keeps extra dims Eigen::array shuffle_dims = {2, 1, 0, 3}; output_tensor = input_tensor.shuffle(shuffle_dims); /*------------------------------------------------------------------*/ /* 5. MEMORY dataspace matches FILE slab exactly */ /*------------------------------------------------------------------*/ - hid_t memspace = H5Screate_simple(4, fcount, nullptr); + hid_t memspace = H5Screate_simple(rank, fcount.data(), nullptr); - plist_id = H5Pcreate(H5P_DATASET_XFER); + hid_t plist_id = H5Pcreate(H5P_DATASET_XFER); H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); herr_t status = H5Dwrite(dset_id, data_type, @@ -314,14 +341,14 @@ void Reader::writeData(const char *fieldName, int load_idx, int time_idx, T *dat } template -void Reader::writeSlab(const char *fieldName, int load_idx, int time_idx, T *data, int size) +void Reader::writeSlab(const char *fieldName, int load_idx, int time_idx, T *data, const std::vector &extra_dims) { if (std::find(resultsToWrite.begin(), resultsToWrite.end(), fieldName) == resultsToWrite.end()) { return; } char name[5096]; snprintf(name, sizeof(name), "%s/load%i/time_step%i/%s", dataset_name, load_idx, time_idx, fieldName); - WriteSlab(data, size, name); + WriteSlab(data, extra_dims, name); } #endif diff --git a/include/solver.h b/include/solver.h index e14e33f..074a909 100644 --- a/include/solver.h +++ b/include/solver.h @@ -456,8 +456,37 @@ double Solver::compute_error(RealArray &r) template void Solver::postprocess(Reader &reader, int load_idx, int time_idx) { - VectorXd strain = VectorXd::Zero(local_n0 * n_y * n_z * n_str); - VectorXd stress = VectorXd::Zero(local_n0 * n_y * n_z * n_str); + int n_gp = matmanager->models[0]->n_gp; + + // Check what user requested + auto &results = reader.resultsToWrite; + bool need_stress = std::find(results.begin(), results.end(), "stress") != results.end(); + bool need_stress_gp = std::find(results.begin(), results.end(), "stress_gp") != results.end(); + bool need_strain = std::find(results.begin(), results.end(), "strain") != results.end(); + bool need_strain_gp = std::find(results.begin(), results.end(), "strain_gp") != results.end(); + + bool need_global_avg = std::find(results.begin(), results.end(), "stress_average") != results.end() || + std::find(results.begin(), results.end(), 
"strain_average") != results.end(); + + bool need_phase_avg = false; + for (int mat_idx = 0; mat_idx < reader.n_mat; ++mat_idx) { + char name[512]; + sprintf(name, "phase_stress_average_phase%d", mat_idx); + if (std::find(results.begin(), results.end(), name) != results.end()) { + need_phase_avg = true; + break; + } + } + + // Determine if we need to compute stress/strain at all + bool need_compute = need_stress || need_stress_gp || need_strain || need_strain_gp || need_global_avg || need_phase_avg; + + // Conditional allocation + VectorXd *strain_elem = need_strain ? new VectorXd(local_n0 * n_y * n_z * n_str) : nullptr; + VectorXd *stress_elem = need_stress ? new VectorXd(local_n0 * n_y * n_z * n_str) : nullptr; + VectorXd *strain_gp = need_strain_gp ? new VectorXd(local_n0 * n_y * n_z * n_gp * n_str) : nullptr; + VectorXd *stress_gp = need_stress_gp ? new VectorXd(local_n0 * n_y * n_z * n_gp * n_str) : nullptr; + VectorXd stress_average = VectorXd::Zero(n_str); VectorXd strain_average = VectorXd::Zero(n_str); @@ -472,23 +501,61 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ Matrix ue; int phase_id; - iterateCubes<0>([&](ptrdiff_t *idx, ptrdiff_t *idxPadding) { - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < howmany; ++j) { - ue(howmany * i + j, 0) = v_u[howmany * idx[i] + j]; + + if (need_compute) { + iterateCubes<0>([&](ptrdiff_t *idx, ptrdiff_t *idxPadding) { + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < howmany; ++j) { + ue(howmany * i + j, 0) = v_u[howmany * idx[i] + j]; + } } - } - phase_id = ms[idx[0]]; + phase_id = ms[idx[0]]; - const MaterialInfo &info = matmanager->get_info(phase_id); - info.model->getStrainStress(strain.segment(n_str * idx[0], n_str).data(), stress.segment(n_str * idx[0], n_str).data(), ue, info.local_mat_id, idx[0]); - stress_average += stress.segment(n_str * idx[0], n_str); - strain_average += strain.segment(n_str * idx[0], n_str); + const MaterialInfo &info = matmanager->get_info(phase_id); - phase_stress_average[phase_id] += stress.segment(n_str * idx[0], n_str); - phase_strain_average[phase_id] += strain.segment(n_str * idx[0], n_str); - phase_counts[phase_id]++; - }); + // Temporary storage for element averages + double elem_strain_avg[n_str]; + double elem_stress_avg[n_str]; + + // Compute once - populates internal eps/sigma + info.model->getStrainStress(elem_strain_avg, elem_stress_avg, ue, info.local_mat_id, idx[0]); + + // Store element averages if requested + if (need_stress) { + for (int c = 0; c < n_str; ++c) { + (*stress_elem)(idx[0] * n_str + c) = elem_stress_avg[c]; + } + } + if (need_strain) { + for (int c = 0; c < n_str; ++c) { + (*strain_elem)(idx[0] * n_str + c) = elem_strain_avg[c]; + } + } + + // Copy all GP data if requested + if (need_stress_gp) { + memcpy(&(*stress_gp)(idx[0] * n_gp * n_str), + info.model->get_sigma_data(), + n_gp * n_str * sizeof(double)); + } + if (need_strain_gp) { + memcpy(&(*strain_gp)(idx[0] * n_gp * n_str), + info.model->get_eps_data(), + n_gp * n_str * sizeof(double)); + } + + // Accumulate for global and phase averages + if (need_global_avg || need_phase_avg) { + for (int c = 0; c < n_str; ++c) { + stress_average(c) += elem_stress_avg[c]; + strain_average(c) += elem_strain_avg[c]; + phase_stress_average[phase_id](c) += elem_stress_avg[c]; + phase_strain_average[phase_id](c) += elem_strain_avg[c]; + } + phase_counts[phase_id]++; + } + }); + } MPI_Allreduce(MPI_IN_PLACE, stress_average.data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); MPI_Allreduce(MPI_IN_PLACE, 
strain_average.data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); @@ -599,16 +666,33 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ reader.writeData("absolute_error", load_idx, time_idx, err_all.data(), dims, 1); vector rank_field(local_n0 * n_y * n_z, world_rank); - reader.writeSlab("mpi_rank", load_idx, time_idx, rank_field.data(), 1); - reader.writeSlab("microstructure", load_idx, time_idx, ms, 1); - reader.writeSlab("displacement_fluctuation", load_idx, time_idx, v_u, howmany); - reader.writeSlab("displacement", load_idx, time_idx, u_total.data(), howmany); - reader.writeSlab("residual", load_idx, time_idx, v_r, howmany); - reader.writeSlab("strain", load_idx, time_idx, strain.data(), n_str); - reader.writeSlab("stress", load_idx, time_idx, stress.data(), n_str); + reader.writeSlab("mpi_rank", load_idx, time_idx, rank_field.data(), {1}); + reader.writeSlab("microstructure", load_idx, time_idx, ms, {1}); + reader.writeSlab("displacement_fluctuation", load_idx, time_idx, v_u, {howmany}); + reader.writeSlab("displacement", load_idx, time_idx, u_total.data(), {howmany}); + reader.writeSlab("residual", load_idx, time_idx, v_r, {howmany}); + + if (need_strain) + reader.writeSlab("strain", load_idx, time_idx, strain_elem->data(), {n_str}); + if (need_stress) + reader.writeSlab("stress", load_idx, time_idx, stress_elem->data(), {n_str}); + if (need_strain_gp) + reader.writeSlab("strain_gp", load_idx, time_idx, strain_gp->data(), {n_gp, n_str}); + if (need_stress_gp) + reader.writeSlab("stress_gp", load_idx, time_idx, stress_gp->data(), {n_gp, n_str}); matmanager->postprocess(*this, reader, load_idx, time_idx); + // Cleanup + if (strain_elem) + delete strain_elem; + if (stress_elem) + delete stress_elem; + if (strain_gp) + delete strain_gp; + if (stress_gp) + delete stress_gp; + // Compute homogenized tangent only if requested if (find(reader.resultsToWrite.begin(), reader.resultsToWrite.end(), "homogenized_tangent") != reader.resultsToWrite.end()) { homogenized_tangent = get_homogenized_tangent(1e-6); From 633fda7e6d5b78feb25b758e629de9305646d5b6 Mon Sep 17 00:00:00 2001 From: Alex Hocks Date: Fri, 2 Jan 2026 18:01:04 +0100 Subject: [PATCH 08/14] remove debug msg --- pyfans/micro.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp index 9658119..5283570 100644 --- a/pyfans/micro.cpp +++ b/pyfans/micro.cpp @@ -36,11 +36,7 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file) reader.force_single_rank = true; reader.communicator = MPI_COMM_SELF; - int rank = -1; - MPI_Comm_rank(reader.communicator, &rank); - reader.ReadMS(3); - printf(">>read input%d id %d\n", rank, sim_id); if (reader.strain_type == "small") { matmanager = createMaterialManager<3, 6>(reader); @@ -50,7 +46,6 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file) matmanager = createMaterialManager<3, 9>(reader); solver = createSolver<3, 9>(reader, std::get*>(matmanager)); } - printf(">>constructed %d id %d\n", rank, sim_id); } py::dict MicroSimulation::solve(py::dict macro_data, double dt) From 9fedc080e8a3e76de121e15d27ee9903bffeda2f Mon Sep 17 00:00:00 2001 From: Alex Hocks Date: Fri, 2 Jan 2026 18:03:43 +0100 Subject: [PATCH 09/14] update lock file --- pixi.lock | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pixi.lock b/pixi.lock index 0679bc1..1c1db95 100644 --- a/pixi.lock +++ b/pixi.lock @@ -6139,6 +6139,9 @@ packages: version: 0.6.1 build: h1235946_0 subdir: linux-64 + variants: + cxx_compiler: clangxx + target_platform: linux-64 
   depends:
   - libstdcxx >=15
   - libgcc >=15
@@ -6150,6 +6153,9 @@
   version: 0.6.1
   build: hcb8d3e5_0
   subdir: osx-arm64
+  variants:
+    cxx_compiler: clangxx
+    target_platform: osx-arm64
   depends:
   - libcxx >=21
   - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_*

From 12df07a009a52c50013e06e7789ea2d0b3f45102 Mon Sep 17 00:00:00 2001
From: Ishaan Desai
Date: Fri, 2 Jan 2026 20:22:37 +0100
Subject: [PATCH 10/14] Formatting

---
 include/reader.h |  4 ++--
 include/solver.h |  2 +-
 pyfans/micro.cpp | 26 ++++++++++++--------------
 pyfans/micro.hpp | 10 +++++-----
 src/reader.cpp   |  7 +++----
 5 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/include/reader.h b/include/reader.h
index 8b65c7f..ef6a87a 100644
--- a/include/reader.h
+++ b/include/reader.h
@@ -22,8 +22,8 @@ class Reader {
     ~Reader();
 
     // MPI controls
-    bool force_single_rank;
-    MPI_Comm communicator;
+    bool     force_single_rank;
+    MPI_Comm communicator;
 
     // contents of input file:
diff --git a/include/solver.h b/include/solver.h
index 074a909..36d36fb 100644
--- a/include/solver.h
+++ b/include/solver.h
@@ -18,7 +18,7 @@ class Solver : private MixedBCController {
 
     const int world_rank;
     const int world_size;
-    MPI_Comm communicator;
+    MPI_Comm  communicator;
 
     const ptrdiff_t n_x, n_y, n_z;
 
diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp
index 5283570..9eb6861 100644
--- a/pyfans/micro.cpp
+++ b/pyfans/micro.cpp
@@ -34,23 +34,22 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file)
     reader.ReadInputFile(input_file);
 
     reader.force_single_rank = true;
-    reader.communicator = MPI_COMM_SELF;
+    reader.communicator      = MPI_COMM_SELF;
 
     reader.ReadMS(3);
 
     if (reader.strain_type == "small") {
         matmanager = createMaterialManager<3, 6>(reader);
-        solver = createSolver<3, 6>(reader, std::get<MaterialManager<3, 6>*>(matmanager));
-    }
-    else {
+        solver     = createSolver<3, 6>(reader, std::get<MaterialManager<3, 6> *>(matmanager));
+    } else {
         matmanager = createMaterialManager<3, 9>(reader);
-        solver = createSolver<3, 9>(reader, std::get<MaterialManager<3, 9>*>(matmanager));
+        solver     = createSolver<3, 9>(reader, std::get<MaterialManager<3, 9> *>(matmanager));
     }
 }
 
 py::dict MicroSimulation::solve(py::dict macro_data, double dt)
 {
-    const bool is_small_strain = std::holds_alternative<MaterialManager<3, 6>*>(matmanager);
+    const bool is_small_strain = std::holds_alternative<MaterialManager<3, 6> *>(matmanager);
 
     // Time step value dt is not used currently, but is available for future use
 
     // Create a pybind style Numpy array from macro_write_data["micro_vector_data"], which is a Numpy array
     py::array_t<double> strain1 = macro_data["strains1to3"].cast<py::array_t<double>>();
     py::array_t<double> strain2 = macro_data["strains4to6"].cast<py::array_t<double>>();
-    py::array_t<double> strain = merge_arrays(strain1, strain2);
+    py::array_t<double> strain  = merge_arrays(strain1, strain2);
     if (not is_small_strain) {
         py::array_t<double> strain3 = macro_data["strains7to9"].cast<py::array_t<double>>();
-        strain = merge_arrays(strain, strain3);
+        strain                      = merge_arrays(strain, strain3);
     }
 
-    std::vector<double> g0 = std::vector<double>(strain.data(), strain.data() + strain.size()); // convert numpy array to std::vector.
+    std::vector<double> g0 = std::vector<double>(strain.data(), strain.data() + strain.size()); // convert numpy array to std::vector.
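
An aside on the pattern used from here on: every call into the material
manager and solver is routed through std::visit, which applies one generic
lambda to whichever alternative the variant currently holds, so the <3, 6>
and <3, 9> instantiations share a single code path. A self-contained
illustration with hypothetical stand-in types (not FANS code):

    #include <iostream>
    #include <variant>

    struct SolverA { double solve() { return 6.0; } };
    struct SolverB { double solve() { return 9.0; } };

    int main()
    {
        // Variant over raw pointers, mirroring solver_t in pyfans/micro.hpp.
        SolverA a;
        std::variant<SolverA *, SolverB *> solver = &a;
        // The generic lambda is instantiated once per alternative; an explicit
        // return type helps when the alternatives' results must unify.
        double r = std::visit([](auto *s) -> double { return s->solve(); }, solver);
        std::cout << r << '\n'; // prints 6
        return 0;
    }
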
     VectorXd homogenized_stress;
 
-    std::visit([&](auto& mm){mm->set_gradient(g0);}, matmanager);
+    std::visit([&](auto &mm) { mm->set_gradient(g0); }, matmanager);
 
-    std::visit([](auto& s){s->solve();}, solver);
+    std::visit([](auto &s) { s->solve(); }, solver);
 
-    homogenized_stress = std::visit([](auto& s) -> VectorXd { return s->get_homogenized_stress();}, solver);
+    homogenized_stress = std::visit([](auto &s) -> VectorXd { return s->get_homogenized_stress(); }, solver);
 
-    auto C = std::visit([&](auto& s) -> MatrixXd {return s->get_homogenized_tangent(pert_param);}, solver);
+    auto C = std::visit([&](auto &s) -> MatrixXd { return s->get_homogenized_tangent(pert_param); }, solver);
 
     // Convert data to a py::dict again to send it back to the Micro Manager
     py::dict micro_write_data;
@@ -98,8 +97,7 @@ py::dict MicroSimulation::solve(py::dict macro_data, double dt)
         micro_write_data["cmat6"] = C_6;
         std::vector<double> C_7 = {C(4, 4), C(4, 5), C(5, 5)};
         micro_write_data["cmat7"] = C_7;
-    }
-    else {
+    } else {
         std::vector<double> stress13 = {homogenized_stress[0], homogenized_stress[1], homogenized_stress[2]};
         micro_write_data["stresses1to3"] = stress13;
         std::vector<double> stress46 = {homogenized_stress[3], homogenized_stress[4], homogenized_stress[5]};
         micro_write_data["stresses4to6"] = stress46;

diff --git a/pyfans/micro.hpp b/pyfans/micro.hpp
index 7eafc2a..1e6ee55 100644
--- a/pyfans/micro.hpp
+++ b/pyfans/micro.hpp
@@ -27,9 +27,9 @@ class MicroSimulation {
     int _sim_id;
     Reader reader;
     // Hardcoding mechanical models because these definitions need information from the input file.
-    using matmanager_t = std::variant<MaterialManager<3, 6>*, MaterialManager<3, 9>*>;
-    using solver_t = std::variant<Solver<3, 6>*, Solver<3, 9>*>;
-    matmanager_t matmanager;
-    solver_t solver;
-    double pert_param = 1e-6; // scalar strain perturbation parameter
+    using matmanager_t = std::variant<MaterialManager<3, 6> *, MaterialManager<3, 9> *>;
+    using solver_t = std::variant<Solver<3, 6> *, Solver<3, 9> *>;
+    matmanager_t matmanager;
+    solver_t solver;
+    double pert_param = 1e-6; // scalar strain perturbation parameter
 };

diff --git a/src/reader.cpp b/src/reader.cpp
index 542127e..7894991 100644
--- a/src/reader.cpp
+++ b/src/reader.cpp
@@ -70,11 +70,10 @@ void Reader ::ReadInputFile(char input_fn[])
 
         if (j.contains("no_mpi")) {
             force_single_rank = true;
-            communicator = MPI_COMM_SELF;
-        }
-        else {
+            communicator = MPI_COMM_SELF;
+        } else {
             force_single_rank = false;
-            communicator = MPI_COMM_WORLD;
+            communicator = MPI_COMM_WORLD;
         }
 
         MPI_Comm_rank(communicator, &world_rank);

From ca926d0766d39ce78b1329d821bad5c9df530cda Mon Sep 17 00:00:00 2001
From: Alex Hocks
Date: Sat, 3 Jan 2026 16:34:06 +0100
Subject: [PATCH 11/14] added logging and comp time log enable

---
 CMakeLists.txt            |   5 ++
 include/MaterialManager.h |  79 ++++++++++----------
 include/general.h         |   2 -
 include/logging.h         |  78 ++++++++++++++++++++
 include/solver.h          |  61 +++++++---------
 include/solverCG.h        |  13 ++--
 include/solverFP.h        |   7 +-
 pyfans/micro.cpp          |   5 ++
 pyfans/micro.hpp          |   1 +
 src/logging.cpp           | 147 ++++++++++++++++++++++++++++++++++++++
 src/main.cpp              |  20 +++---
 src/reader.cpp            |  74 +++++++++----------
 12 files changed, 350 insertions(+), 142 deletions(-)
 create mode 100755 include/logging.h
 create mode 100755 src/logging.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85a7736..efb3c1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,6 +112,9 @@ find_package(nlohmann_json REQUIRED)
 # TARGETS #
 ##############################################################################
 
+set(FANS_VERBOSITY 5 CACHE STRING "Set verbosity level: 0 none, 5 all")
+add_definitions(-DVERBOSITY=${FANS_VERBOSITY}) + option(FANS_BUILD_STATIC "Build static library" OFF) if (FANS_BUILD_STATIC) add_library(FANS_FANS STATIC) @@ -150,6 +153,7 @@ target_include_directories(FANS_FANS PUBLIC $ @@ -110,51 +111,47 @@ class MaterialManager { compute_reference_stiffness(reader); // Print detailed information about material configuration for logging - if (reader.world_rank == 0) { - printf("\n# MaterialManager initialized:\n"); - printf("# Number of material models: %zu\n", models.size()); - printf("# Number of phases: %d\n#\n", n_phases); - - for (size_t i = 0; i < mats.size(); ++i) { - const auto &mg = mats[i]; - printf("# Material Model %zu: %s\n", i + 1, mg["matmodel"].get().c_str()); - - // Print phases - auto phases = mg["phases"].get>(); - printf("# Phases: ["); - for (size_t j = 0; j < phases.size(); ++j) { - printf("%d", phases[j]); - if (j < phases.size() - 1) - printf(", "); - } - printf("]\n"); - - // Print material properties - printf("# Material properties:\n"); - const auto &props = mg["material_properties"]; - for (auto it = props.begin(); it != props.end(); ++it) { - printf("# %s: ", it.key().c_str()); - if (it.value().is_array()) { - printf("["); - for (size_t k = 0; k < it.value().size(); ++k) { - if (it.value()[k].is_number()) { - printf("%.5g", it.value()[k].get()); - } else if (it.value()[k].is_string()) { - printf("\"%s\"", it.value()[k].get().c_str()); - } - if (k < it.value().size() - 1) - printf(", "); + Log::io->info() << "\n# MaterialManager initialized:\n"; + Log::io->info() << "# Number of material models: " << models.size() << "\n"; + Log::io->info() << "# Number of phases: " << n_phases << "\n#\n"; + + for (size_t i = 0; i < mats.size(); ++i) { + const auto &mg = mats[i]; + Log::io->info() << "# Material Model " << i + 1 << ": " << mg["matmodel"].get() << "\n"; + + // Print phases + auto phases = mg["phases"].get>(); + Log::io->info() << "# Phases: ["; + for (size_t j = 0; j < phases.size(); ++j) { + Log::io->info(true) << phases[j]; + if (j < phases.size() - 1) Log::io->info(true) << ", "; + } + Log::io->info(true) << "]\n"; + + // Print material properties + Log::io->info() << "# Material properties:\n"; + const auto &props = mg["material_properties"]; + for (auto it = props.begin(); it != props.end(); ++it) { + Log::io->info() << "# " << it.key() << ": "; + if (it.value().is_array()) { + Log::io->info(true) << "["; + for (size_t k = 0; k < it.value().size(); ++k) { + if (it.value()[k].is_number()) { + Log::io->info(true) << std::setprecision(5) << it.value()[k].get() << std::defaultfloat; + } else if (it.value()[k].is_string()) { + Log::io->info(true) << "\"" << it.value()[k].get() << "\""; } - printf("]"); - } else if (it.value().is_number()) { - printf("%.5g", it.value().get()); - } else if (it.value().is_string()) { - printf("\"%s\"", it.value().get().c_str()); + if (k < it.value().size() - 1) Log::io->info(true) << ", "; } - printf("\n"); + Log::io->info(true) << "]"; + } else if (it.value().is_number()) { + Log::io->info(true) << std::setprecision(5) << it.value().get() << std::defaultfloat; + } else if (it.value().is_string()) { + Log::io->info(true) << "\"" << it.value().get() << "\""; } - printf("#\n"); + Log::io->info(true) << "\n"; } + Log::io->info() << "#\n"; } } diff --git a/include/general.h b/include/general.h index 1128e8a..1949615 100644 --- a/include/general.h +++ b/include/general.h @@ -58,6 +58,4 @@ inline void FANS_free(V *p) } #endif // FANS_MALLOC_H -#define VERBOSITY 0 - // #define EIGEN_RUNTIME_NO_MALLOC 
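Aside (editorial): before the new logging header below, a usage sketch of the API it declares — my reading of the header, not code from this series. At this point in the series `Log::init` takes only rank and size; patch 14 later adds a communicator argument.

```cpp
#include "logging.h"
#include "mpi.h"

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    Log::init(rank, size);                  // creates Log::general / Log::solver / Log::io
    Log::io->info() << "# reading input\n"; // prefixed and timestamped, active rank only
    Log::io->info() << "# phases: [";
    Log::io->info(true) << 1 << ", " << 2;  // append=true continues the same line
    Log::io->info(true) << "]\n";
    Log::finalize();                        // release loggers before MPI_Finalize

    MPI_Finalize();
    return 0;
}
```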
diff --git a/include/logging.h b/include/logging.h new file mode 100755 index 0000000..871ed0a --- /dev/null +++ b/include/logging.h @@ -0,0 +1,78 @@ +#ifndef LOGGING_H +#define LOGGING_H + +#include +#include +#include +#include + +namespace Log { + +enum Level { + Error, + Info, + Warn, + Debug, + All +}; + +[[maybe_unused]] const std::string level_to_string(Level level); + +extern Level active_level; + +class Logger { + public: + explicit Logger(std::string prefix, int comm_rank, int comm_size); + + /// error > info > warn > debug > trace + std::ostream& error(bool append=false); + /// error > info > warn > debug > trace + std::ostream& info(bool append=false); + /// error > info > warn > debug > trace + std::ostream& warn(bool append=false); + /// error > info > warn > debug > trace + std::ostream& debug(bool append=false); + /// error > info > warn > debug > trace + std::ostream& trace(bool append=false); + /// progress bar + void progress(const std::string& prefix, int step, int max); + + private: + /// starting time + std::chrono::steady_clock::time_point _start_time; + /// what the logger should always print first + const std::string _prefix; + /// empty stream to write nothing + std::ofstream _nullstr; + /// MPI comm rank + int _comm_rank; + /// MPI comm size + int _comm_size; + +}; + +/** + * Creates all loggers and sets the level + * */ +void init(int comm_rank, int comm_size); + +/** + * Frees all memory + * */ +void finalize(); + +/** + * Set activate rank + */ +[[maybe_unused]] void setActiveRank(int rank); + +/// logger with prefix [GENERAL] +extern std::unique_ptr general; +/// logger with prefix [SOLVER] +extern std::unique_ptr solver; +/// logger with prefix [IO] +extern std::unique_ptr io; + +}; // namespace Log + +#endif \ No newline at end of file diff --git a/include/solver.h b/include/solver.h index cb05d49..8cc9740 100644 --- a/include/solver.h +++ b/include/solver.h @@ -3,6 +3,7 @@ #include "matmodel.h" #include "MaterialManager.h" +#include "logging.h" class J2Plasticity; @@ -144,9 +145,8 @@ Solver::Solver(Reader &reader, MaterialManager * template void Solver::computeFundamentalSolution() { - if (world_rank == 0) { - printf("\n# Start creating Fundamental Solution(s) \n"); - } + Log::solver->info() << "\n# Start creating Fundamental Solution(s) \n"; + clock_t tot_time = clock(); Matrix Ker0 = matmanager->models[0]->Compute_Reference_ElementStiffness(matmanager->kapparef_mat); @@ -198,9 +198,7 @@ void Solver::computeFundamentalSolution() fundamentalSolution /= (double) (n_x * n_y * n_z); tot_time = clock() - tot_time; - if (world_rank == 0) { - printf("# Complete; Time for construction of Fundamental Solution(s): %f seconds\n", double(tot_time) / CLOCKS_PER_SEC); - } + Log::solver->info() << "# Complete; Time for construction of Fundamental Solution(s): " << static_cast(tot_time) / CLOCKS_PER_SEC << " seconds\n"; } template @@ -282,20 +280,18 @@ void Solver::compute_residual(RealArray &r_matrix, RealArray &u_ template void Solver::solve() { - err_all = ArrayXd::Zero(n_it + 1); fft_time = 0.0; clock_t tot_time = clock(); internalSolve(); tot_time = clock() - tot_time; - // if( VERBOSITY > 5 ){ - if (world_rank == 0) { - printf("# FFT Time per iteration ....... %2.6f sec\n", double(fft_time) / CLOCKS_PER_SEC / iter); - printf("# Total FFT Time ............... %2.6f sec\n", double(fft_time) / CLOCKS_PER_SEC); - printf("# Total Time per iteration ..... %2.6f sec\n", double(tot_time) / CLOCKS_PER_SEC / iter); - printf("# Total Time ................... 
%2.6f sec\n", double(tot_time) / CLOCKS_PER_SEC); - printf("# FFT contribution to total time %2.6f %% \n", 100. * double(fft_time) / double(tot_time)); - } + + Log::solver->info() << "# FFT Time per iteration ....... " << std::setw(2) << std::setprecision(6) << double(fft_time) / CLOCKS_PER_SEC / iter << std::defaultfloat << " sec\n"; + Log::solver->info() << "# Total FFT Time ............... " << std::setw(2) << std::setprecision(6) << double(fft_time) / CLOCKS_PER_SEC << std::defaultfloat << " sec\n"; + Log::solver->info() << "# Total Time per iteration ..... " << std::setw(2) << std::setprecision(6) << double(tot_time) / CLOCKS_PER_SEC / iter << std::defaultfloat << " sec\n"; + Log::solver->info() << "# Total Time ................... " << std::setw(2) << std::setprecision(6) << double(tot_time) / CLOCKS_PER_SEC << std::defaultfloat << " sec\n"; + Log::solver->info() << "# FFT contribution to total time " << std::setw(2) << std::setprecision(6) << 100. * double(fft_time) / double(tot_time) << std::defaultfloat << " % \n"; + matmanager->update_internal_variables(); } @@ -433,13 +429,12 @@ double Solver::compute_error(RealArray &r) double err0 = err_all[0]; double err_rel = (iter == 0 ? 100 : err / err0); - if (world_rank == 0) { - if (iter == 0) { - printf("Before 1st iteration: %16.8e\n", err0); - } else { - printf("it %3lu .... err %16.8e / %8.4e, ratio: %4.8e, FFT time: %2.6f sec\n", iter, err, err / err0, (iter == 1 ? 0.0 : err / err_all[iter - 1]), double(buftime) / CLOCKS_PER_SEC); - } - } + if (iter == 0) Log::solver->info() << "Before 1st iteration: " << std::setw(16) << std::setprecision(8) << err0 << std::defaultfloat << "\n"; + else Log::solver->info() << "it " << iter << " .... " + << "err " << std::setw(16) << std::setprecision(8) << err << std::defaultfloat << " / " + << std::setw( 8) << std::setprecision(4) << err / err0 << std::defaultfloat + << ", ratio: " << std::setw(4) << std::setprecision(8) << (iter == 1 ? 0.0 : err / err_all[iter - 1]) << std::defaultfloat + << ", FFT time: " << std::setw(2) << std::setprecision(6) << static_cast(buftime) / CLOCKS_PER_SEC << std::defaultfloat << " sec\n"; const std::string &error_type = reader.errorParameters["type"].get(); if (error_type == "absolute") { @@ -573,16 +568,12 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ } } - if (world_rank == 0) { - printf("# Effective Stress .. ("); - for (int i = 0; i < n_str; ++i) - printf("%+.12f ", stress_average[i]); - printf(") \n"); - printf("# Effective Strain .. ("); - for (int i = 0; i < n_str; ++i) - printf("%+.12f ", strain_average[i]); - printf(") \n\n"); - } + Log::solver->info() << "# Effective Stress .. ("; + for (int i = 0; i < n_str; ++i) Log::solver->info(true) << std::showpos << std::fixed << std::setprecision(12) << stress_average[i] << " " << std::noshowpos << std::defaultfloat; + Log::solver->info(true) << ") \n"; + Log::solver->info() << "# Effective Strain .. 
("; + for (int i = 0; i < n_str; ++i) Log::solver->info(true) << std::showpos << std::fixed << std::setprecision(12) << strain_average[i] << " " << std::noshowpos << std::defaultfloat; + Log::solver->info(true) << ") \n\n"; /* ====================================================================== * * u_total = g0·X + ũ (vector or scalar, decided at compile time) @@ -695,11 +686,7 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ if (find(reader.resultsToWrite.begin(), reader.resultsToWrite.end(), "homogenized_tangent") != reader.resultsToWrite.end()) { homogenized_tangent = get_homogenized_tangent(1e-6); hsize_t dims[2] = {static_cast(n_str), static_cast(n_str)}; - if (world_rank == 0) { - cout << "# Homogenized tangent: " << endl - << setprecision(12) << homogenized_tangent << endl - << endl; - } + Log::solver->info() << "# Homogenized tangent: \n" << std::setprecision(12) << homogenized_tangent << std::defaultfloat << "\n\n"; reader.writeData("homogenized_tangent", load_idx, time_idx, homogenized_tangent.data(), dims, 2); } extrapolateDisplacement(); // prepare v_u for next time step diff --git a/include/solverCG.h b/include/solverCG.h index b02a428..513a34b 100644 --- a/include/solverCG.h +++ b/include/solverCG.h @@ -60,8 +60,7 @@ double SolverCG::dotProduct(RealArray &a, RealArray &b) template void SolverCG::internalSolve() { - if (this->world_rank == 0) - printf("\n# Start FANS - Conjugate Gradient Solver \n"); + Log::solver->info() << "\n# Start FANS - Conjugate Gradient Solver \n"; bool islinear = this->matmanager->all_linear; @@ -110,8 +109,8 @@ void SolverCG::internalSolve() iter++; err_rel = this->compute_error(v_r_real); } - if (this->world_rank == 0) - printf("# Complete FANS - Conjugate Gradient Solver \n"); + + Log::solver->info() << "# Complete FANS - Conjugate Gradient Solver \n"; } template @@ -142,8 +141,10 @@ void SolverCG::LineSearchSecant() } v_u_real += d_real * (alpha_new - alpha_old); v_r_real = rnew_real; - if (this->world_rank == 0) - printf("line search iter %i, alpha %f - error %e - ", _iter, alpha_new, err); + + Log::solver->info() << "line search iter " << _iter + << ", alpha " << std::fixed << std::setprecision(6) << alpha_new << std::defaultfloat + << " - error " << std::fixed << std::setprecision(6) << err << std::defaultfloat << " - "; } template diff --git a/include/solverFP.h b/include/solverFP.h index 67b99a4..6a00691 100644 --- a/include/solverFP.h +++ b/include/solverFP.h @@ -32,8 +32,7 @@ SolverFP::SolverFP(Reader &reader, MaterialManager void SolverFP::internalSolve() { - if (this->world_rank == 0) - printf("\n# Start FANS - Fixed Point Solver \n"); + Log::solver->info() << "\n# Start FANS - Fixed Point Solver \n"; this->template compute_residual<2>(v_r_real, v_u_real); @@ -50,7 +49,7 @@ void SolverFP::internalSolve() iter++; err_rel = this->compute_error(v_r_real); } - if (this->world_rank == 0) - printf("# Complete FANS - Fixed Point Solver \n"); + + Log::solver->info() << "# Complete FANS - Fixed Point Solver \n"; } #endif diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp index b9f3636..c195ab1 100644 --- a/pyfans/micro.cpp +++ b/pyfans/micro.cpp @@ -37,6 +37,11 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file) solver = createSolver<3, 6>(reader, matmanager); } +MicroSimulation::~MicroSimulation() +{ + Log::finalize(); +} + py::dict MicroSimulation::solve(py::dict macro_data, double dt) { // Time step value dt is not used currently, but is available for future use diff --git a/pyfans/micro.hpp b/pyfans/micro.hpp 
index b6ebc6c..a8c5bb2 100644 --- a/pyfans/micro.hpp +++ b/pyfans/micro.hpp @@ -20,6 +20,7 @@ namespace py = pybind11; class MicroSimulation { public: MicroSimulation(int sim_id, char *input_file = "input.json"); + ~MicroSimulation(); py::dict solve(py::dict macro_write_data, double dt); private: diff --git a/src/logging.cpp b/src/logging.cpp new file mode 100755 index 0000000..9a8a354 --- /dev/null +++ b/src/logging.cpp @@ -0,0 +1,147 @@ +#include "logging.h" + +#include + +#include "mpi.h" + +int active_rank = 0; + +Log::Logger::Logger(std::string prefix, int comm_rank, int comm_size) : _prefix(std::move(prefix)), _nullstr(), _comm_rank(comm_rank), _comm_size(comm_size) { + _start_time = std::chrono::steady_clock::now(); + _nullstr.setstate(std::ios_base::badbit); +} + +std::ostream &Log::Logger::error(bool append) { + if (active_level >= Error && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} + +std::ostream &Log::Logger::info(bool append) { + if (active_level >= Info && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} + +std::ostream &Log::Logger::warn(bool append) { + if (active_level >= Warn && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} + +std::ostream &Log::Logger::debug(bool append) { + if (active_level >= Debug && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} + +std::ostream &Log::Logger::trace(bool append) { + if (active_level >= All && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} +void Log::Logger::progress(const std::string &prefix, int step, int max) { + if (_comm_rank != active_rank) return; + + if (step == 0) std::cout << _prefix << prefix; + + int digits = 1; + int div = 10; + while ((max / div) > 0) { + digits++; + div *= 10; + } + if (step > 0) for (int i = 0; i < digits; i++) std::cout << '\b'; + + int step_digits = 1; + div = 10; + while ((step / div) > 0) { + step_digits++; + div *= 10; + } + + for (int i = 0; i < digits - step_digits; i++) std::cout << ' '; + std::cout << step; + + if (step >= max-1) { + for (int i = 0; i < digits; i++) + std::cout << '\b'; + for (int i = 0; i < prefix.size(); i++) + std::cout << '\b'; + for (int i = 0; i < _prefix.size(); i++) + std::cout << '\b'; + } + //if (step >= max) std::cout << '\n'; + std::cout.flush(); +} 
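Aside (editorial): the Logger constructor above puts its fallback `std::ofstream` into the badbit state, which is the standard trick for a zero-cost null sink — inserts into a failed stream are discarded without opening a file or allocating. A standalone demonstration:

```cpp
#include <fstream>
#include <iostream>

int main() {
    std::ofstream null_stream;                    // never opened
    null_stream.setstate(std::ios_base::badbit);  // all inserts now fail fast
    null_stream << "this text goes nowhere\n";    // silently discarded
    std::cout << "discarding? " << !null_stream.good() << "\n"; // prints 1
    return 0;
}
```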
+ +void Log::init(int comm_rank, int comm_size) { + Level level = Log::Error; + if constexpr (VERBOSITY <= 0) level = Error; + if constexpr (VERBOSITY == 1) level = Info; + if constexpr (VERBOSITY == 2) level = Warn; + if constexpr (VERBOSITY == 3) level = Debug; + if constexpr (VERBOSITY >= 4) level = All; + + active_level = level; + general = std::make_unique("[FANS-GENERAL] ", comm_rank, comm_size); + solver = std::make_unique("[FANS-SOLVER] ", comm_rank, comm_size); + io = std::make_unique("[FANS-IO] ", comm_rank, comm_size); +} + +void Log::finalize() { + general = nullptr; + solver = nullptr; + io = nullptr; +} + +Log::Level Log::active_level = Info; +std::unique_ptr Log::general = nullptr; +std::unique_ptr Log::solver = nullptr; +std::unique_ptr Log::io = nullptr; + +const std::string Log::level_to_string(Log::Level level) { + switch (level) { + case Error: + return "Error"; + case Info: + return "Info"; + case Warn: + return "Warn"; + case Debug: + return "Debug"; + case All: + return "All"; + } + return ""; +} + +void Log::setActiveRank(int rank) { + active_rank = rank; +} + diff --git a/src/main.cpp b/src/main.cpp index 6c69bf6..bcdce0b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -4,6 +4,7 @@ #include "solver.h" // Version +#include "logging.h" #include "version.h" template @@ -15,17 +16,13 @@ void runSolver(Reader &reader) MaterialManager *matmanager = createMaterialManager(reader); Solver *solver = createSolver(reader, matmanager); - if (reader.world_rank == 0) { - printf("\n╔════════════════════════════════════════════════════════════ Load case %zu/%zu: %zu time steps ════════════════════════════════════════════════════════════╗\n", - load_path_idx + 1, reader.load_cases.size(), reader.load_cases[load_path_idx].n_steps); - } + Log::general->info() << "\n╔════════════════════════════════════════════════════════════ Load case " + << load_path_idx + 1 << "/" << reader.load_cases.size() << ": " << reader.load_cases[load_path_idx].n_steps + << " time steps ════════════════════════════════════════════════════════════╗\n"; for (size_t time_step_idx = 0; time_step_idx < reader.load_cases[load_path_idx].n_steps; ++time_step_idx) { - if (reader.world_rank == 0) { - printf("║ ▶ Time step %zu/%zu (load case %zu/%zu) ◀ \n", - time_step_idx + 1, reader.load_cases[load_path_idx].n_steps, - load_path_idx + 1, reader.load_cases.size()); - } + Log::general->info() << "║ ▶ Time step " << time_step_idx + 1 << "/" << reader.load_cases[load_path_idx].n_steps + << " (load case " << load_path_idx + 1 << "/" << reader.load_cases.size() << ") ◀ \n"; if (reader.load_cases[load_path_idx].mixed) { solver->enableMixedBC(reader.load_cases[load_path_idx].mbc, time_step_idx); } else { @@ -37,9 +34,7 @@ void runSolver(Reader &reader) } delete solver; delete matmanager; - if (reader.world_rank == 0) { - printf("╚══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝\n"); - } + Log::general->info() << "╚══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝\n"; } } @@ -73,6 +68,7 @@ int main(int argc, char *argv[]) } reader.CloseResultsFile(); + Log::finalize(); MPI_Finalize(); return 0; } diff --git a/src/reader.cpp b/src/reader.cpp index 288d4af..a10984f 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -1,5 +1,6 @@ #include "general.h" #include "reader.h" +#include "logging.h" #include "H5Cpp.h" #include 
"fftw3-mpi.h" @@ -35,10 +36,8 @@ void Reader::ComputeVolumeFractions() // Calculate total number of materials n_mat = global_max - global_min + 1; - if (world_rank == 0) { - printf("# Number of materials: %i (from %u to %u)\n", n_mat, global_min, global_max); - printf("# Volume fractions\n"); - } + Log::io->info() << "# Number of materials: " << n_mat << " (from " << global_min << " to " << global_max << ")\n"; + Log::io->info() << "# Volume fractions\n"; // Using dynamic allocation for arrays since we don't know size at compile time std::vector vol_frac(n_mat, 0); @@ -54,9 +53,7 @@ void Reader::ComputeVolumeFractions() long vf; MPI_Allreduce(&(vol_frac[i]), &vf, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); v_frac[i] = double(vf) / double(dims[0] * dims[1] * dims[2]); - if (world_rank == 0) - printf("# material %4u vol. frac. %10.4f%% \n", - static_cast(i) + global_min, 100. * v_frac[i]); + Log::io->info() << "# material " << static_cast(i) + global_min << " vol. frac. " << std::setw(10) << std::setprecision(4) << 100. * v_frac[i] << std::defaultfloat << "%\n"; } } @@ -67,6 +64,8 @@ void Reader ::ReadInputFile(char input_fn[]) MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); MPI_Comm_size(MPI_COMM_WORLD, &world_size); + Log::init(world_rank, world_size); + ifstream i(input_fn); json j; i >> j; @@ -148,22 +147,20 @@ void Reader ::ReadInputFile(char input_fn[]) load_cases.push_back(std::move(lc)); } - if (world_rank == 0) { - printf("# microstructure file name: \t '%s'\n", ms_filename); - printf("# microstructure dataset name: \t '%s'\n", ms_datasetname); - printf("# strain type: \t %s\n", strain_type.c_str()); - printf("# problem type: \t %s\n", problemType.c_str()); - printf("# FE type: \t %s\n", FE_type.c_str()); - printf( - "# FANS error measure: \t %s %s error \n", - errorParameters["type"].get().c_str(), - errorParameters["measure"].get().c_str()); - printf("# FANS Tolerance: \t %10.5e\n", errorParameters["tolerance"].get()); - printf("# Max iterations: \t %6i\n", n_it); - } + Log::io->info() << "# microstructure file name: \t " << ms_filename << "\n"; + Log::io->info() << "# microstructure dataset name: \t " << ms_datasetname <<"\n"; + Log::io->info() << "# strain type: \t " << strain_type << "\n"; + Log::io->info() << "# problem type: \t " << problemType << "\n"; + Log::io->info() << "# FE type: \t " << FE_type << "\n"; + Log::io->info() << "# FANS error measure: \t " << + errorParameters["type"].get() << " " << + errorParameters["measure"].get() << " error \n"; + Log::io->info() << "# FANS Tolerance: \t " << std::setw(10) << std::setprecision(5) << errorParameters["tolerance"].get() << std::defaultfloat << "e\n"; + Log::io->info() << "# Max iterations: \t " << n_it << "\n"; } catch (const std::exception &e) { - fprintf(stderr, "ERROR trying to read input file '%s' for FANS\n", input_fn); + Log::io->error() << "ERROR trying to read input file '" << input_fn << "' for FANS\n"; + Log::finalize(); exit(10); } } @@ -257,13 +254,8 @@ void Reader ::ReadMS(int hm) H5Aclose(attr_id); H5Tclose(attr_type); } - if (world_rank == 0) { - if (is_zyx) { - printf("# Using Z-Y-X dimension ordering for the microstructure data\n"); - } else { - printf("# Using X-Y-Z dimension ordering for the microstructure data\n"); - } - } + if (is_zyx) Log::io->info() << "# Using Z-Y-X dimension ordering for the microstructure data\n"; + else Log::io->info() << "# Using X-Y-Z dimension ordering for the microstructure data\n"; dims.resize(3); if (is_zyx) { /* file layout Z Y X -> logical X Y Z */ @@ -281,18 +273,20 @@ void 
Reader ::ReadMS(int hm) l_e[1] = L[1] / double(dims[1]); l_e[2] = L[2] / double(dims[2]); - if (world_rank == 0) { - printf("# grid size set to [%i x %i x %i] --> %i voxels \nMicrostructure length: [%3.6f x %3.6f x %3.6f]\n", dims[0], dims[1], dims[2], dims[0] * dims[1] * dims[2], L[0], L[1], L[2]); - if (dims[0] % 2 != 0) - fprintf(stderr, "[ FANS3D_Grid ] WARNING: n_x is not a multiple of 2\n"); - if (dims[1] % 2 != 0) - fprintf(stderr, "[ FANS3D_Grid ] WARNING: n_y is not a multiple of 2\n"); - if (dims[2] % 2 != 0) - fprintf(stderr, "[ FANS3D_Grid ] WARNING: n_z is not a multiple of 2\n"); - if (dims[0] / 4 < world_size) - throw std::runtime_error("[ FANS3D_Grid ] ERROR: Please decrease the number of processes or increase the grid size to ensure that each process has at least 4 boxels in the x direction."); - printf("Voxel length: [%1.8f, %1.8f, %1.8f]\n", l_e[0], l_e[1], l_e[2]); - } + Log::io->info() << "# grid size set to [" << dims[0] << " x " << dims[1] << " x " << dims[2] << "] --> " << dims[0] * dims[1] * dims[2] << " voxels \n"; + Log::io->info() << "Microstructure length: [" + << std::setw(3) << std::setprecision(6) << L[0] << std::defaultfloat << " x " + << std::setw(3) << std::setprecision(6) << L[1] << std::defaultfloat << " x " + << std::setw(3) << std::setprecision(6) << L[2] << std::defaultfloat << "]\n"; + if (dims[0] % 2 != 0) Log::io->warn() << "[ FANS3D_Grid ] WARNING: n_x is not a multiple of 2\n"; + if (dims[1] % 2 != 0) Log::io->warn() << "[ FANS3D_Grid ] WARNING: n_y is not a multiple of 2\n"; + if (dims[2] % 2 != 0) Log::io->warn() << "[ FANS3D_Grid ] WARNING: n_z is not a multiple of 2\n"; + if (dims[0] / 4 < world_size) + throw std::runtime_error("[ FANS3D_Grid ] ERROR: Please decrease the number of processes or increase the grid size to ensure that each process has at least 4 boxels in the x direction."); + Log::io->info() << "Voxel length: [" + << std::setw(1) << std::setprecision(8) << l_e[0] << std::defaultfloat << ", " + << std::setw(1) << std::setprecision(8) << l_e[1] << std::defaultfloat << ", " + << std::setw(1) << std::setprecision(8) << l_e[2] << std::defaultfloat << "]\n"; const ptrdiff_t n[3] = {dims[0], dims[1], dims[2] / 2 + 1}; ptrdiff_t block0 = FFTW_MPI_DEFAULT_BLOCK; From a10c32959122a49cd8a41c349840ea5a2caf5b43 Mon Sep 17 00:00:00 2001 From: Alex Hocks Date: Sat, 3 Jan 2026 17:20:06 +0100 Subject: [PATCH 12/14] small optim --- include/MaterialManager.h | 2 +- include/matmodel.h | 10 ++++++---- pyfans/micro.cpp | 8 +++++--- pyfans/micro.hpp | 2 ++ 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/MaterialManager.h b/include/MaterialManager.h index e1d7469..6a9b5ef 100644 --- a/include/MaterialManager.h +++ b/include/MaterialManager.h @@ -210,7 +210,7 @@ class MaterialManager { } // Set macroscale loading gradient for all models - void set_gradient(vector g0) + void set_gradient(const vector& g0) { for (auto *model : models) { model->setGradient(g0); diff --git a/include/matmodel.h b/include/matmodel.h index bd3a82d..a43087f 100644 --- a/include/matmodel.h +++ b/include/matmodel.h @@ -27,7 +27,7 @@ class Matmodel { Matrix Compute_Reference_ElementStiffness(const Matrix &kapparef_mat); Matrix &element_residual(Matrix &ue, int mat_index, ptrdiff_t element_idx); void getStrainStress(double *strain, double *stress, Matrix &ue, int mat_index, ptrdiff_t element_idx); - void setGradient(vector _g0); + void setGradient(const vector &_g0); // Accessors for internal Gauss point data (populated after getStrainStress call) inline 
const double *get_eps_data() const
@@ -225,11 +225,13 @@ void Matmodel<howmany, n_str>::getStrainStress(double *strain, double *stress, M
 }
 
 template <int howmany, int n_str>
-void Matmodel<howmany, n_str>::setGradient(vector<double> _g0)
+void Matmodel<howmany, n_str>::setGradient(const vector<double> &_g0)
 {
+    // _g0 is much smaller than g0 and can stay in cache;
+    // with j as the inner loop, cache lines would be used poorly
     macroscale_loading = _g0;
-    for (int i = 0; i < n_str; i++) {
-        for (int j = 0; j < n_gp; ++j) {
+    for (int j = 0; j < n_gp; ++j) {
+        for (int i = 0; i < n_str; i++) {
             g0(n_str * j + i, 0) = _g0[i];
         }
     }

diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp
index c195ab1..eb45ff2 100644
--- a/pyfans/micro.cpp
+++ b/pyfans/micro.cpp
@@ -35,6 +35,8 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file)
     reader.ReadMS(3);
     matmanager = createMaterialManager<3, 6>(reader);
     solver = createSolver<3, 6>(reader, matmanager);
+
+    g0.resize(6, 0);
 }
 
 MicroSimulation::~MicroSimulation()
@@ -51,17 +53,17 @@ py::dict MicroSimulation::solve(py::dict macro_data, double dt)
     py::array_t<double> strain2 = macro_data["strains4to6"].cast<py::array_t<double>>();
     py::array_t<double> strain = merge_arrays(strain1, strain2);
 
-    std::vector<double> g0 = std::vector<double>(strain.data(), strain.data() + strain.size()); // convert numpy array to std::vector.
+    std::copy(strain.begin(), strain.end(), g0.begin()); // copy numpy data into the preallocated buffer
 
     VectorXd homogenized_stress;
 
-    matmanager->set_gradient(g0);
+    matmanager->set_gradient(g0); // TODO done
 
     solver->solve();
 
     homogenized_stress = solver->get_homogenized_stress();
 
-    auto C = solver->get_homogenized_tangent(pert_param);
+    MatrixXd C = solver->get_homogenized_tangent(pert_param);
 
     // Convert data to a py::dict again to send it back to the Micro Manager
     py::dict micro_write_data;

diff --git a/pyfans/micro.hpp b/pyfans/micro.hpp
index a8c5bb2..51ce762 100644
--- a/pyfans/micro.hpp
+++ b/pyfans/micro.hpp
@@ -30,4 +30,6 @@ class MicroSimulation {
     MaterialManager<3, 6> *matmanager;
     Solver<3, 6> *solver;
     double pert_param = 1e-6; // scalar strain perturbation parameter
+
+    std::vector<double> g0; // gradient buffer, alloc once
 };

From 4bb23c14dc580ed02a15395f4686f1b5a7d782f1 Mon Sep 17 00:00:00 2001
From: Alex Hocks
Date: Sat, 3 Jan 2026 17:43:11 +0100
Subject: [PATCH 13/14] fix

---
 pyfans/micro.cpp | 3 +--
 pyfans/micro.hpp | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp
index eb45ff2..65f8b8b 100644
--- a/pyfans/micro.cpp
+++ b/pyfans/micro.cpp
@@ -36,7 +36,6 @@ MicroSimulation::MicroSimulation(int sim_id, char *input_file)
     matmanager = createMaterialManager<3, 6>(reader);
     solver = createSolver<3, 6>(reader, matmanager);
 
-    g0.resize(6, 0);
 }
 
 MicroSimulation::~MicroSimulation()
@@ -53,7 +52,7 @@ py::dict MicroSimulation::solve(py::dict macro_data, double dt)
     py::array_t<double> strain2 = macro_data["strains4to6"].cast<py::array_t<double>>();
     py::array_t<double> strain = merge_arrays(strain1, strain2);
 
-    std::copy(strain.begin(), strain.end(), g0.begin()); // copy numpy data into the preallocated buffer
+    std::vector<double> g0 = std::vector<double>(strain.data(), strain.data() + strain.size()); // convert numpy array to std::vector.
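Aside (editorial): patch 12 above interchanges the `setGradient` loops so that, for each Gauss point `j`, the `n_str` components land contiguously (`g0` is indexed `n_str * j + i`). A tiny standalone sketch of that access pattern, with illustrative sizes:

```cpp
#include <vector>

int main() {
    const int n_gp = 8, n_str = 6;                 // illustrative sizes
    std::vector<double> src = {1, 2, 3, 4, 5, 6};  // stands in for _g0
    std::vector<double> g0(n_gp * n_str, 0.0);     // stands in for the member g0

    for (int j = 0; j < n_gp; ++j)        // outer: Gauss points
        for (int i = 0; i < n_str; ++i)   // inner: contiguous n_str-block per point
            g0[n_str * j + i] = src[i];

    return g0[n_str] == src[0] ? 0 : 1;   // each block repeats src; exit 0 on success
}
```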
VectorXd homogenized_stress; diff --git a/pyfans/micro.hpp b/pyfans/micro.hpp index 51ce762..a8c5bb2 100644 --- a/pyfans/micro.hpp +++ b/pyfans/micro.hpp @@ -30,6 +30,4 @@ class MicroSimulation { MaterialManager<3, 6> *matmanager; Solver<3, 6> *solver; double pert_param = 1e-6; // scalar strain perturbation parameter - - std::vector g0; // gradient buffer, alloc once }; From 9e751eb11088594e3fdb192fd240bc7c03a5a9b1 Mon Sep 17 00:00:00 2001 From: Alex Hocks Date: Mon, 12 Jan 2026 20:26:54 +0100 Subject: [PATCH 14/14] Add rank sync on trace log and impl serialization --- CMakeLists.txt | 1 + include/MaterialManager.h | 17 ++- include/logging.h | 36 ++++- include/material_models/J2Plasticity.h | 11 ++ include/material_models/J2PlasticityNew.h | 9 ++ include/matmodel.h | 37 ++++- include/reader.h | 20 ++- include/serialization.h | 164 ++++++++++++++++++++++ include/solver.h | 39 ++++- pyfans/micro.cpp | 104 +++++++++++--- pyfans/micro.hpp | 8 +- src/logging.cpp | 43 ++++-- src/reader.cpp | 94 ++++++++++++- 13 files changed, 539 insertions(+), 44 deletions(-) create mode 100644 include/serialization.h diff --git a/CMakeLists.txt b/CMakeLists.txt index efb3c1f..91523d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -162,6 +162,7 @@ set_property(TARGET FANS_FANS PROPERTY PUBLIC_HEADER include/solver.h include/setup.h include/mixedBCs.h + include/serialization.h include/material_models/LinearThermal.h include/material_models/GBDiffusion.h diff --git a/include/MaterialManager.h b/include/MaterialManager.h index 6a9b5ef..a905c97 100644 --- a/include/MaterialManager.h +++ b/include/MaterialManager.h @@ -4,6 +4,7 @@ #include "general.h" #include "logging.h" #include "matmodel.h" +#include "serialization.h" template Matmodel *createMatmodel(const Reader &reader); @@ -26,7 +27,7 @@ struct MaterialInfo { * */ template -class MaterialManager { +class MaterialManager : public Serializable { private: MaterialInfo *phase_to_info{nullptr}; // [n_phases] - HOT DATA int n_phases; @@ -201,6 +202,20 @@ class MaterialManager { } } + void register_serialization(registry_t &r) override + { + for (auto *model : models) { + model->register_serialization(r); + } + } + + void init_deserialization() override + { + for (auto *model : models) { + model->init_deserialization(); + } + } + // Update internal variables after converged time step void update_internal_variables() { diff --git a/include/logging.h b/include/logging.h index 871ed0a..734a669 100755 --- a/include/logging.h +++ b/include/logging.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include "mpi.h" namespace Log { @@ -20,9 +22,33 @@ enum Level { extern Level active_level; +class Logger; +class MPI_TraceSync { + public: + explicit MPI_TraceSync(Logger &log, bool append); + ~MPI_TraceSync(); + + template + MPI_TraceSync &operator<<(const T &v); + MPI_TraceSync &operator<<(std::ostream &(*m)(std::ostream &) ); + + private: + Logger &_log; + bool _append; + std::ostringstream _buffer; +}; + +template +MPI_TraceSync &MPI_TraceSync::operator<<(const T &v) +{ + _buffer << v; + return *this; +} + class Logger { public: - explicit Logger(std::string prefix, int comm_rank, int comm_size); + friend class MPI_TraceSync; + explicit Logger(std::string prefix, int comm_rank, int comm_size, const MPI_Comm& comm); /// error > info > warn > debug > trace std::ostream& error(bool append=false); @@ -33,11 +59,12 @@ class Logger { /// error > info > warn > debug > trace std::ostream& debug(bool append=false); /// error > info > warn > debug > trace - 
std::ostream& trace(bool append=false); + MPI_TraceSync trace(bool append=false); /// progress bar void progress(const std::string& prefix, int step, int max); private: + std::ostream& trace_impl(bool append=false); /// starting time std::chrono::steady_clock::time_point _start_time; /// what the logger should always print first @@ -48,13 +75,14 @@ class Logger { int _comm_rank; /// MPI comm size int _comm_size; - + /// communicator + MPI_Comm _comm; }; /** * Creates all loggers and sets the level * */ -void init(int comm_rank, int comm_size); +void init(int comm_rank, int comm_size, const MPI_Comm& comm); /** * Frees all memory diff --git a/include/material_models/J2Plasticity.h b/include/material_models/J2Plasticity.h index 33f2a69..e657a20 100644 --- a/include/material_models/J2Plasticity.h +++ b/include/material_models/J2Plasticity.h @@ -55,6 +55,17 @@ class J2Plasticity : public SmallStrainMechModel { psi_bar_t.resize(num_elements, Matrix::Zero(6, num_gauss_points)); } + void register_serialization(registry_t &r) override + { + SmallStrainMechModel::register_serialization(r); + for (auto &mat : plasticStrain_t) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &mat : plasticStrain) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &mat : psi_bar_t) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &mat : psi_bar) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &vec : psi_t) r.emplace_back(std::data(vec), (vec.size()) * sizeof(double), true); + for (auto &vec : psi) r.emplace_back(std::data(vec), (vec.size()) * sizeof(double), true); + } + virtual void updateInternalVariables() override { plasticStrain_t = plasticStrain; diff --git a/include/material_models/J2PlasticityNew.h b/include/material_models/J2PlasticityNew.h index e62996e..3f0e26c 100644 --- a/include/material_models/J2PlasticityNew.h +++ b/include/material_models/J2PlasticityNew.h @@ -34,6 +34,15 @@ class J2PlasticityNew : public SmallStrainMechModel { q_t.resize(num_elements, VectorXd::Zero(num_gauss_points)); } + void register_serialization(registry_t &r) override + { + SmallStrainMechModel::register_serialization(r); + for (auto &mat : plasticStrain_t) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &mat : plasticStrain) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &vec : q_t) r.emplace_back(std::data(vec), (vec.size()) * sizeof(double), true); + for (auto &vec : q) r.emplace_back(std::data(vec), (vec.size()) * sizeof(double), true); + } + virtual void updateInternalVariables() override { plasticStrain_t = plasticStrain; diff --git a/include/matmodel.h b/include/matmodel.h index a43087f..ffdb1bc 100644 --- a/include/matmodel.h +++ b/include/matmodel.h @@ -3,12 +3,13 @@ #define MATMODEL_H #include "general.h" +#include "serialization.h" template class Solver; template -class Matmodel { +class Matmodel : public Serializable { public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW // see http://eigen.tuxfamily.org/dox-devel/group__TopicStructHavingEigenMembers.html @@ -53,6 +54,9 @@ class Matmodel { virtual ~Matmodel() = default; + void register_serialization(registry_t &r) override; + void init_deserialization() override; + protected: double l_e_x; double l_e_y; @@ -101,6 +105,37 @@ Matmodel::Matmodel(const Reader &reader) sigma.resize(n_str * n_gp); } +template +void Matmodel::register_serialization(registry_t &r) +{ + 
//r.emplace_back(&verbosity, sizeof(int), false); + //r.emplace_back(&n_mat, sizeof(int), false); + //r.emplace_back(&n_gp, sizeof(int), false); + + //r.emplace_back(std::data(FE_type), (FE_type.size() + 1) * sizeof(char), true); + //r.emplace_back(std::data(macroscale_loading), macroscale_loading.size() * sizeof(double), true); + + //r.emplace_back(&l_e_x, sizeof(double), false); + //r.emplace_back(&l_e_y, sizeof(double), false); + //r.emplace_back(&l_e_z, sizeof(double), false); + //r.emplace_back(&v_e, sizeof(double), false); + + //for (auto& mat : B_int) r.emplace_back(std::data(mat), mat.size() * sizeof(double), true); + + r.emplace_back(std::data(B), B.size() * sizeof(double), true); + r.emplace_back(std::data(eps), eps.size() * sizeof(double), true); + r.emplace_back(std::data(g0), g0.size() * sizeof(double), true); + r.emplace_back(std::data(sigma), sigma.size() * sizeof(double), true); + r.emplace_back(std::data(res_e), res_e.size() * sizeof(double), true); +} + +template +void Matmodel::init_deserialization() +{ + // need to allocate enough memory first + //FE_type = std::string("\0\0\0\0\0\0"); +} + template void Matmodel::Construct_B() { diff --git a/include/reader.h b/include/reader.h index ef6a87a..e720704 100644 --- a/include/reader.h +++ b/include/reader.h @@ -11,12 +11,21 @@ #include "H5FDmpio.h" #include "mpi.h" +#include "serialization.h" + using namespace std; -class Reader { +class Reader : public Serializable { public: - // Default constructor - Reader() = default; + // Explicit default constructor + Reader(): + force_single_rank(false), communicator(nullptr), n_mat(0), + TOL(0), n_it(0), world_rank(0), world_size(0), alloc_local(0), + local_n0(0), local_0_start(0), local_n1(0), local_1_start(0) + { + manual_serialize = true; + manual_deserialize = true; + } // Destructor to free allocated memory ~Reader(); @@ -65,8 +74,11 @@ class Reader { // void Setup(ptrdiff_t howmany); void ReadInputFile(char input_fn[]); + void ReadJson(json j); void ReadMS(int hm); - void ComputeVolumeFractions(); + void ComputeVolumeFractions(); + length_t serialize_override(buffer_t &buffer, length_t offset) override; + length_t deserialize_override(buffer_t &buffer, length_t offset) override; // void ReadHDF5(char file_name[], char dset_name[]); void safe_create_group(hid_t file, const char *const name); void OpenResultsFile(const char *output_fn); // Open results file once diff --git a/include/serialization.h b/include/serialization.h new file mode 100644 index 0000000..4771241 --- /dev/null +++ b/include/serialization.h @@ -0,0 +1,164 @@ +// +// Created by Alex Hocks on 09/01/26. 
+// + +#ifndef SERIALIZATION_H +#define SERIALIZATION_H +#include "logging.h" + +#include +#include +#include + +template, typename r = std::conditional_t> +class SerializationBuffer : public Base { + // Buffer always has size multiple of 4 byte +public: + using Base::Base; + using size_type = typename Base::size_type; + void resize(size_type new_size) + { + const auto num_bytes = new_size * data_size; + const auto delta_bytes = (4 - (num_bytes % 4)) % 4; + new_size += delta_bytes / data_size; + Base::resize(new_size); + } + + void resize(size_type new_size, const T& value) + { + const auto num_bytes = new_size * data_size; + const auto delta_bytes = (4 - (num_bytes % 4)) % 4; + new_size += delta_bytes / data_size; + Base::resize(new_size, value); + } + + void add_size(size_type delta) { resize(Base::size() + delta); } + + std::vector& as_int_vector() { return *reinterpret_cast*>(this); } + + static SerializationBuffer& from_int_vector(std::vector& vec) { return *reinterpret_cast(&vec); } + +private: + static constexpr auto data_size = sizeof(T); +}; + +class Serializable { + /** + * Serializes any object that implements this. + * Buffer elements will be bytes, but total count will always be padded to be multiple of 4. + * Data is run length encoded. + */ +public: + // run length type + using length_t = uint64_t; + // run length block size + static constexpr auto header_len = sizeof(length_t); + // byte data type + using data_t = uint8_t; + // buffer type + using buffer_t = SerializationBuffer; + // registry type: pointer to data, data size, true: run length encoded, false: direct read write + using reg_entry_t = std::tuple; + using registry_t = std::vector; + + virtual ~Serializable() = default; + + /** + * Serializes this object + * @return serial buffer + */ + buffer_t serialize_full() + { + if (manual_deserialize) { + buffer_t buffer; + serialize_override(buffer, 0); + return buffer; + } + + Log::io->trace() << "Serialization::serialize_full() - register_serialization\n"; + registry_t registry; + register_serialization(registry); + + Log::io->trace() << "Serialization::serialize_full() - accumulate\n"; + const length_t total_size = std::accumulate(registry.begin(), registry.end(), 0UL, + [](const length_t acc, const reg_entry_t &e) { + const bool use_run_length = std::get<2>(e); + length_t size = std::get<1>(e); + if (use_run_length) size += header_len; + return acc + size; + }); + + buffer_t buffer; + buffer.resize(total_size); + Log::io->trace() << "Serialization::serialize_full() - buffer size=" << total_size << " addr=" << static_cast(std::data(buffer)) << "\n"; + + Log::io->trace() << "Serialization::serialize_full() - process entries\n"; + length_t offset = 0UL; + length_t pos = 0UL; + for (const reg_entry_t &e : registry) { + const bool use_run_length = std::get<2>(e); + length_t size = std::get<1>(e); + void* data = std::get<0>(e); + Log::io->trace() << "Serialization::serialize_full() - entry: pos=" << pos << " addr=" << data << " offset=" << offset << " useRL=" << use_run_length << " size=" << size << "\n"; + + if (use_run_length) { + *reinterpret_cast(std::data(buffer) + offset) = size; + offset += header_len; + } + std::memcpy(std::data(buffer) + offset, data, size); + offset += size; + pos++; + } + + Log::io->trace() << "Serialization::serialize_full() - done\n"; + return buffer; + } + + virtual void register_serialization(registry_t &r) {} + + virtual void init_deserialization() {} + + virtual registry_t::iterator late_init_deserialization(registry_t &r, 
registry_t::iterator begin) { return r.begin(); } + + bool manual_deserialize = false; + bool manual_serialize = false; + + /** + * Deserialize this object using provided data + * @param buffer + */ + void deserialize_full(buffer_t &buffer) + { + if (manual_deserialize) { + deserialize_override(buffer, 0); + return; + } + + init_deserialization(); // alloc mem if needed + + // gather serialization description + registry_t registry; + register_serialization(registry); + // place correct lengths into descriptor list, etc + late_init_deserialization(registry, registry.begin()); + + length_t offset = 0UL; + for (reg_entry_t &e : registry) { + const bool use_run_length = std::get<2>(e); + length_t &size = std::get<1>(e); + void* data = std::get<0>(e); + + if (use_run_length) { + size = *reinterpret_cast(std::data(buffer) + offset); + offset += header_len; + } + std::memcpy(data, std::data(buffer) + offset, size); + offset += size; + } + } + + virtual length_t deserialize_override(buffer_t &buffer, length_t offset) { return offset; } + virtual length_t serialize_override(buffer_t &buffer, length_t offset) { return offset; } +}; + +#endif diff --git a/include/solver.h b/include/solver.h index 82539de..6cd7c37 100644 --- a/include/solver.h +++ b/include/solver.h @@ -4,15 +4,16 @@ #include "matmodel.h" #include "MaterialManager.h" #include "logging.h" +#include "serialization.h" class J2Plasticity; typedef Map, Unaligned, OuterStride<>> RealArray; template -class Solver : private MixedBCController { +class Solver : MixedBCController, public Serializable { public: - Solver(Reader &reader, MaterialManager *matmanager); + explicit Solver(Reader &reader, MaterialManager *matmanager=nullptr); virtual ~Solver(); Reader &reader; @@ -45,6 +46,10 @@ class Solver : private MixedBCController { ArrayXd err_all; //!< Absolute error history Matrix fundamentalSolution; + void init_fundamentalSolutionBuffer() + { + fundamentalSolution = Matrix(howmany, (local_n1 * n_x * (n_z / 2 + 1) * (howmany + 1)) / 2); + } template void iterateCubes(F f); @@ -64,6 +69,8 @@ class Solver : private MixedBCController { double compute_error(RealArray &r); void CreateFFTWPlans(double *in, fftw_complex *transformed, double *out); + void register_serialization(registry_t &r) override; + VectorXd homogenized_stress; VectorXd get_homogenized_stress(); @@ -133,15 +140,19 @@ Solver::Solver(Reader &reader, MaterialManager * rhat((std::complex *) v_r, local_n1 * n_x * (n_z / 2 + 1) * howmany), // actual initialization is below buffer_padding(fftw_alloc_real(n_y * (n_z + 2) * howmany)) { + Log::solver->trace() << "Start constructing solver\n"; + v_u_real.setZero(); for (ptrdiff_t i = local_n0 * n_y * n_z * howmany; i < (local_n0 + 1) * n_y * n_z * howmany; i++) { this->v_u[i] = 0; } std::memset(v_u_prev, 0, local_n0 * n_y * n_z * howmany * sizeof(double)); - matmanager->initialize_internal_variables(local_n0 * n_y * n_z, matmanager->models[0]->n_gp); - - computeFundamentalSolution(); + if (matmanager != nullptr) { + Log::solver->trace() << "Init internal vars.\n"; + matmanager->initialize_internal_variables(local_n0 * n_y * n_z, matmanager->models[0]->n_gp); + computeFundamentalSolution(); + } } template @@ -162,7 +173,7 @@ void Solver::computeFundamentalSolution() Matrix, 8, 1> A; Matrix AA; Matrix block; - fundamentalSolution = Matrix(howmany, (local_n1 * n_x * (n_z / 2 + 1) * (howmany + 1)) / 2); + init_fundamentalSolutionBuffer(); fundamentalSolution.setZero(); for (int i_y = 0; i_y < local_n1; ++i_y) { @@ -206,6 +217,7 @@ void 
Solver::computeFundamentalSolution() template void Solver::CreateFFTWPlans(double *in, fftw_complex *transformed, double *out) { + Log::solver->trace() << "Solver::CreateFFTWPlans() begin \n"; int rank = 3; ptrdiff_t iblock = FFTW_MPI_DEFAULT_BLOCK; ptrdiff_t oblock = FFTW_MPI_DEFAULT_BLOCK; @@ -225,6 +237,21 @@ void Solver::CreateFFTWPlans(double *in, fftw_complex *transform new (&rhat) Map((std::complex *) transformed, local_n1 * n_x * (n_z / 2 + 1) * howmany); } +template +void Solver::register_serialization(registry_t &r) +{ + Log::io->trace() << "Solver::register_serialization v_r size=" << std::max(reader.alloc_local * 2, (local_n0 + 1) * n_y * (n_z + 2) * howmany) * sizeof(double) << "\n"; + r.emplace_back(v_r, std::max(reader.alloc_local * 2, (local_n0 + 1) * n_y * (n_z + 2) * howmany) * sizeof(double), true); + Log::io->trace() << "Solver::register_serialization v_u size=" << (local_n0 * n_y * n_z * howmany) * sizeof(double) << "\n"; + r.emplace_back(v_u, (local_n0 * n_y * n_z * howmany) * sizeof(double), true); + Log::io->trace() << "Solver::register_serialization v_u_prev size=" << (local_n0 * n_y * n_z * howmany) * sizeof(double) << "\n"; + r.emplace_back(v_u_prev, (local_n0 * n_y * n_z * howmany) * sizeof(double), true); + Log::io->trace() << "Solver::register_serialization rhat size=" << (local_n1 * n_x * (n_z / 2 + 1) * howmany) * sizeof(std::complex) << "\n"; + r.emplace_back(std::data(rhat), (local_n1 * n_x * (n_z / 2 + 1) * howmany) * sizeof(std::complex), true); + Log::io->trace() << "Solver::register_serialization fundamentalSolution size=" << ((local_n1 * n_x * (n_z / 2 + 1) * (howmany + 1)) / 2) * sizeof(double) << "\n"; + r.emplace_back(std::data(fundamentalSolution), ((local_n1 * n_x * (n_z / 2 + 1) * (howmany + 1)) / 2) * sizeof(double), true); +} + // TODO: possibly circumvent the padding problem by accessing r as a matrix? template template diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp index 81c164a..4e82f22 100644 --- a/pyfans/micro.cpp +++ b/pyfans/micro.cpp @@ -25,25 +25,23 @@ py::array_t merge_arrays(py::array_t array1, py::array_t return result; } -MicroSimulation::MicroSimulation(int sim_id, char *input_file) +MicroSimulation::MicroSimulation(int sim_id, bool late_init, char *input_file) : _sim_id(sim_id) { // initialize fftw mpi fftw_mpi_init(); - // Input file name is hardcoded. TODO: Make it configurable - reader.ReadInputFile(input_file); - - reader.force_single_rank = true; - reader.communicator = MPI_COMM_SELF; - - reader.ReadMS(3); - - if (reader.strain_type == "small") { - matmanager = createMaterialManager<3, 6>(reader); - solver = createSolver<3, 6>(reader, std::get *>(matmanager)); - } else { - matmanager = createMaterialManager<3, 9>(reader); - solver = createSolver<3, 9>(reader, std::get *>(matmanager)); + if (not late_init || sim_id >= 0) { + // Input file name is hardcoded. 
TODO: Make it configurable + reader.ReadInputFile(input_file); + reader.ReadMS(3); + + if (reader.strain_type == "small") { + matmanager = createMaterialManager<3, 6>(reader); + solver = createSolver<3, 6>(reader, std::get *>(matmanager)); + } else { + matmanager = createMaterialManager<3, 9>(reader); + solver = createSolver<3, 9>(reader, std::get *>(matmanager)); + } } } @@ -114,6 +112,63 @@ py::dict MicroSimulation::solve(py::dict macro_data, double dt) return micro_write_data; } +py::dict MicroSimulation::get_state() +{ + py::dict state; + + Log::io->debug() << "Serializing reader\n"; + Serializable::buffer_t reader_state = reader.serialize_full(); + state["reader_state"] = reader_state.as_int_vector(); + Log::io->debug() << "Serializing solver\n"; + Serializable::buffer_t solver_state = std::visit([](auto &s) { return s->serialize_full(); }, solver); + state["solver_state"] = solver_state.as_int_vector(); + Log::io->debug() << "Serializing material manager\n"; + Serializable::buffer_t matmngr_state = std::visit([](auto &m) { return m->serialize_full(); }, matmanager); + state["matmanager_state"] = matmngr_state.as_int_vector(); + Log::io->debug() << "Serializing micro sim done\n"; + + return state; +} + +void MicroSimulation::set_state(py::dict &state) +{ + auto py_reader_state = state["reader_state"].cast>(); + std::vector reader_state_i(py_reader_state.data(), py_reader_state.data() + py_reader_state.size()); + Serializable::buffer_t &reader_state = Serializable::buffer_t::from_int_vector(reader_state_i); + + auto py_solver_state = state["solver_state"].cast>(); + std::vector solver_state_i(py_solver_state.data(), py_solver_state.data() + py_solver_state.size()); + Serializable::buffer_t &solver_state = Serializable::buffer_t::from_int_vector(solver_state_i); + + auto py_matmngr_state = state["matmanager_state"].cast>(); + std::vector matmngr_state_i(py_matmngr_state.data(), py_matmngr_state.data() + py_matmngr_state.size()); + Serializable::buffer_t &matmngr_state = Serializable::buffer_t::from_int_vector(matmngr_state_i); + + reader.deserialize_full(reader_state); + if (reader.strain_type == "small") { + auto *mat_ptr = createMaterialManager<3, 6>(reader); + mat_ptr->deserialize_full(matmngr_state); + matmanager = mat_ptr; + auto *sol_ptr = createSolver<3, 6>(reader, nullptr); + sol_ptr->init_fundamentalSolutionBuffer(); + sol_ptr->matmanager = std::get *>(matmanager); + sol_ptr->deserialize_full(solver_state); + solver = sol_ptr; + + } else { + auto *mat_ptr = createMaterialManager<3, 9>(reader); + mat_ptr->deserialize_full(matmngr_state); + matmanager = mat_ptr; + auto *sol_ptr = createSolver<3, 9>(reader, nullptr); + sol_ptr->init_fundamentalSolutionBuffer(); + sol_ptr->matmanager = std::get *>(matmanager); + sol_ptr->deserialize_full(solver_state); + solver = sol_ptr; + } +} + +int MicroSimulation::get_id() { return _sim_id; } + PYBIND11_MODULE(PyFANS, m) { // optional docstring @@ -121,5 +176,22 @@ PYBIND11_MODULE(PyFANS, m) py::class_(m, "MicroSimulation") .def(py::init()) - .def("solve", &MicroSimulation::solve); + .def("solve", &MicroSimulation::solve) + .def(py::pickle( + [](MicroSimulation& m) { + return py::make_tuple(m.get_id(), m.get_state()); + }, + [](py::tuple t) { + int id = t[0].cast(); + py::dict state = t[1].cast(); + + MicroSimulation* m = new MicroSimulation(id, true); + m->set_state(state); + return m; + } + )) + .def("set_state", &MicroSimulation::set_state) + .def("get_state", &MicroSimulation::get_state) + .def("get_global_id", &MicroSimulation::get_id) + 
diff --git a/pyfans/micro.hpp b/pyfans/micro.hpp
index 0e7293e..db83a04 100644
--- a/pyfans/micro.hpp
+++ b/pyfans/micro.hpp
@@ -15,15 +15,21 @@
 #include "matmodel.h"
 #include "MaterialManager.h"
 #include "solver.h"
+#include "serialization.h"
 
 namespace py = pybind11;
 
 class MicroSimulation
 {
   public:
-    MicroSimulation(int sim_id, char *input_file = "input.json");
+    MicroSimulation(int sim_id, bool late_init = false, char *input_file = "input.json");
     ~MicroSimulation();
     py::dict solve(py::dict macro_write_data, double dt);
 
+    py::dict get_state();
+    void     set_state(py::dict &state);
+
+    int get_id();
+
   private:
     int    _sim_id;
     Reader reader;
diff --git a/src/logging.cpp b/src/logging.cpp
index 9a8a354..cc36aa6 100755
--- a/src/logging.cpp
+++ b/src/logging.cpp
@@ -6,7 +6,29 @@
 
 int active_rank = 0;
 
-Log::Logger::Logger(std::string prefix, int comm_rank, int comm_size) : _prefix(std::move(prefix)), _nullstr(), _comm_rank(comm_rank), _comm_size(comm_size) {
+Log::MPI_TraceSync::MPI_TraceSync(Logger &log, bool append) : _log(log), _append(append) {}
+
+Log::MPI_TraceSync::~MPI_TraceSync()
+{
+    // On destruction, flush the buffered message rank by rank so trace
+    // output from different MPI ranks is not interleaved.
+    if (active_level >= All) {
+        MPI_Barrier(_log._comm);
+        const int size = _log._comm_size;
+        for (int i = 0; i < size; i++) {
+            Log::setActiveRank(i);
+            MPI_Barrier(_log._comm);
+            _log.trace_impl(_append) << _buffer.str() << std::flush;
+        }
+        Log::setActiveRank(0);
+        MPI_Barrier(_log._comm);
+    }
+}
+
+Log::MPI_TraceSync &Log::MPI_TraceSync::operator<<(std::ostream &(*m)(std::ostream &))
+{
+    _buffer << m;
+    return *this;
+}
+
+Log::Logger::Logger(std::string prefix, int comm_rank, int comm_size, const MPI_Comm &comm) : _prefix(std::move(prefix)), _nullstr(), _comm_rank(comm_rank), _comm_size(comm_size), _comm(comm) {
     _start_time = std::chrono::steady_clock::now();
     _nullstr.setstate(std::ios_base::badbit);
 }
@@ -55,16 +77,21 @@ std::ostream &Log::Logger::debug(bool append) {
     } else return _nullstr;
 }
 
-std::ostream &Log::Logger::trace(bool append) {
+Log::MPI_TraceSync Log::Logger::trace(bool append) {
+    return Log::MPI_TraceSync(*this, append);
+}
+
+std::ostream &Log::Logger::trace_impl(bool append) {
     if (active_level >= All && _comm_rank == active_rank) {
         if (append)
            return std::cout;
-        std::cout << _prefix;
+        std::cout << _prefix << "[" << active_rank << "] ";
        auto now         = std::chrono::steady_clock::now();
        auto elapse_time = static_cast<double>(std::chrono::duration_cast<std::chrono::milliseconds>(now - _start_time).count()) / 1000.0;
        std::cout << elapse_time << ": ";
        return std::cout;
-    } else return _nullstr;
+    } else return _nullstr;
 }
+
 void Log::Logger::progress(const std::string &prefix, int step, int max) {
     if (_comm_rank != active_rank)
         return;
@@ -100,7 +127,7 @@ void Log::Logger::progress(const std::string &prefix, int step, int max) {
     std::cout.flush();
 }
 
-void Log::init(int comm_rank, int comm_size) {
+void Log::init(int comm_rank, int comm_size, const MPI_Comm &comm) {
     Level level = Log::Error;
     if constexpr (VERBOSITY <= 0) level = Error;
     if constexpr (VERBOSITY == 1) level = Info;
@@ -109,9 +136,9 @@ void Log::init(int comm_rank, int comm_size) {
     if constexpr (VERBOSITY >= 4) level = All;
     active_level = level;
 
-    general = std::make_unique<Logger>("[FANS-GENERAL] ", comm_rank, comm_size);
-    solver  = std::make_unique<Logger>("[FANS-SOLVER] ", comm_rank, comm_size);
-    io      = std::make_unique<Logger>("[FANS-IO] ", comm_rank, comm_size);
+    general = std::make_unique<Logger>("[FANS-GENERAL] ", comm_rank, comm_size, comm);
+    solver  = std::make_unique<Logger>("[FANS-SOLVER] ", comm_rank, comm_size, comm);
+    io      = std::make_unique<Logger>("[FANS-IO] ", comm_rank, comm_size, comm);
 }
 
 void Log::finalize() {
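The MPI_TraceSync destructor implements barrier-synchronized, rank-ordered flushing: every rank buffers its message locally, then ranks take turns writing while the others wait at a barrier. A stand-alone sketch of the same round-robin technique (plain MPI, no FANS types):

    #include <mpi.h>
    #include <iostream>
    #include <sstream>

    int main(int argc, char **argv) {
        MPI_Init(&argc, &argv);
        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        // Buffer the message instead of writing to std::cout immediately.
        std::ostringstream buffer;
        buffer << "trace from rank " << rank << "\n";

        // Ranks flush one at a time, in rank order.
        for (int i = 0; i < size; i++) {
            MPI_Barrier(MPI_COMM_WORLD);
            if (rank == i)
                std::cout << buffer.str() << std::flush;
        }
        MPI_Barrier(MPI_COMM_WORLD);

        MPI_Finalize();
        return 0;
    }

Note that the MPI standard does not strictly guarantee the ordering of stdout forwarded from different ranks; the barriers make interleaving unlikely in practice, which is sufficient for trace-level logging.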
diff --git a/src/reader.cpp b/src/reader.cpp
index 5734324..b1fe343 100644
--- a/src/reader.cpp
+++ b/src/reader.cpp
@@ -57,12 +57,24 @@ void Reader::ComputeVolumeFractions()
     }
 }
 
-void Reader ::ReadInputFile(char input_fn[])
+void Reader::ReadInputFile(char input_fn[])
 {
     try {
         ifstream i(input_fn);
         json     j;
         i >> j;
+        ReadJson(j);
+
+    } catch (const std::exception &e) {
+        Log::io->error() << "ERROR trying to read input file '" << input_fn << "' for FANS\n";
+        Log::finalize();
+        exit(10);
+    }
+}
+
+void Reader::ReadJson(json j)
+{
+    try {
         inputJson = j; // Store complete input JSON for MaterialManager
 
         if (j.contains("no_mpi")) {
@@ -75,7 +87,8 @@ void Reader ::ReadInputFile(char input_fn[])
     MPI_Comm_rank(communicator, &world_rank);
     MPI_Comm_size(communicator, &world_size);
 
-    Log::init(world_rank, world_size);
+    Log::init(world_rank, world_size, communicator);
+    Log::io->trace() << "Running with total ranks=" << world_size << ", rank=" << world_rank << "\n";
 
     microstructure = j["microstructure"];
     strcpy(ms_filename, microstructure["filepath"].get<std::string>().c_str());
@@ -165,7 +178,7 @@ void Reader ::ReadInputFile(char input_fn[])
     Log::io->info() << "# Max iterations: \t " << n_it << "\n";
 
     } catch (const std::exception &e) {
-        Log::io->error() << "ERROR trying to read input file '" << input_fn << "' for FANS\n";
+        Log::io->error() << "ERROR trying to read input json for FANS\n";
         Log::finalize();
         exit(10);
     }
@@ -381,6 +394,7 @@ void Reader ::ReadMS(int hm)
         FANS_free(tmp);
     } else {
         /* XYZ case: the slab is already in correct order */
+        FANS_free(ms); // free the old allocation to avoid a leak
         ms = tmp;      // steal the buffer; no copy
     }
 
@@ -396,6 +410,80 @@ void Reader ::ReadMS(int hm)
     this->ComputeVolumeFractions();
 }
 
+Reader::length_t Reader::serialize_override(buffer_t &buffer, length_t offset)
+{
+    // write JSON
+    std::string j_str      = inputJson.dump();
+    length_t    block_size = (j_str.size() + 1) * sizeof(char);
+    buffer.add_size(block_size + header_len);
+    *reinterpret_cast<length_t *>(std::data(buffer) + offset) = block_size;
+    offset += header_len;
+    std::memcpy(std::data(buffer) + offset, j_str.c_str(), block_size);
+    offset += block_size;
+
+    // write ms metadata
+    block_size = sizeof(bool) +
+                 3 * sizeof(int) +
+                 3 * sizeof(double) +
+                 5 * sizeof(ptrdiff_t) +
+                 sizeof(int);
+    buffer.add_size(block_size);
+    *reinterpret_cast<bool *>(std::data(buffer) + offset) = is_zyx;               offset += sizeof(bool);
+    std::memcpy(std::data(buffer) + offset, std::data(dims), sizeof(int) * 3);    offset += sizeof(int) * 3;
+    std::memcpy(std::data(buffer) + offset, std::data(l_e), sizeof(double) * 3);  offset += sizeof(double) * 3;
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = alloc_local;     offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = local_n0;        offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = local_0_start;   offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = local_n1;        offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = local_1_start;   offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<int *>(std::data(buffer) + offset) = n_mat;                 offset += sizeof(int);
+
+    // write ms voxel data
+    block_size = local_n0 * dims[1] * dims[2] * sizeof(unsigned short);
+    buffer.add_size(block_size);
+    std::memcpy(std::data(buffer) + offset, ms, block_size);
+    offset += block_size;
+
+    return offset;
+}
+
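serialize_override writes one flat byte stream: a length_t header for the variable-sized JSON block, then a fixed-size metadata block, then the raw voxel array, advancing a single offset; deserialize_override below walks the same layout in the same order. A reduced stand-alone sketch of this length-prefixed layout (simplified types; std::uint64_t stands in for length_t, std::vector<char> for Serializable::buffer_t):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    using length_t = std::uint64_t;

    int main() {
        std::string payload = "{\"strain_type\": \"small\"}";
        double      l_e[3]  = {1.0, 1.0, 1.0};

        // Serialize: [length header][payload bytes][fixed-size POD block].
        std::vector<char> buffer(sizeof(length_t) + payload.size() + sizeof(l_e));
        std::size_t offset     = 0;
        length_t    block_size = payload.size();
        std::memcpy(buffer.data() + offset, &block_size, sizeof(block_size));  offset += sizeof(block_size);
        std::memcpy(buffer.data() + offset, payload.data(), block_size);       offset += block_size;
        std::memcpy(buffer.data() + offset, l_e, sizeof(l_e));                 offset += sizeof(l_e);

        // Deserialize: read the header first, then consume exactly that many bytes.
        offset = 0;
        length_t in_size;
        std::memcpy(&in_size, buffer.data() + offset, sizeof(in_size));        offset += sizeof(in_size);
        std::string in_payload(buffer.data() + offset, in_size);               offset += in_size;
        double in_l_e[3];
        std::memcpy(in_l_e, buffer.data() + offset, sizeof(in_l_e));

        assert(in_payload == payload && in_l_e[2] == l_e[2]);
        return 0;
    }

The sketch uses std::memcpy for the header deliberately: the *reinterpret_cast<length_t *> stores in the patch assume the buffer offsets are suitably aligned, which std::memcpy does not require.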
+Reader::length_t Reader::deserialize_override(buffer_t &buffer, length_t offset)
+{
+    // read JSON
+    length_t block_size = *reinterpret_cast<length_t *>(std::data(buffer) + offset);
+    offset += header_len;
+    std::string tmp;
+    tmp.resize(block_size / sizeof(char), '\0');
+    std::memcpy(std::data(tmp), std::data(buffer) + offset, block_size);
+    offset += block_size;
+
+    std::istringstream iss(tmp);
+    json j;
+    iss >> j;
+    ReadJson(j);
+
+    // read ms metadata
+    is_zyx = *reinterpret_cast<bool *>(std::data(buffer) + offset);               offset += sizeof(bool);
+    dims.resize(3);
+    std::memcpy(std::data(dims), std::data(buffer) + offset, sizeof(int) * 3);    offset += sizeof(int) * 3;
+    l_e.resize(3);
+    std::memcpy(std::data(l_e), std::data(buffer) + offset, sizeof(double) * 3);  offset += sizeof(double) * 3;
+    alloc_local   = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset);   offset += sizeof(ptrdiff_t);
+    local_n0      = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset);   offset += sizeof(ptrdiff_t);
+    local_0_start = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset);   offset += sizeof(ptrdiff_t);
+    local_n1      = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset);   offset += sizeof(ptrdiff_t);
+    local_1_start = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset);   offset += sizeof(ptrdiff_t);
+    n_mat         = *reinterpret_cast<int *>(std::data(buffer) + offset);         offset += sizeof(int);
+
+    // read ms voxel data
+    block_size = local_n0 * dims[1] * dims[2] * sizeof(unsigned short);
+    ms         = FANS_malloc<unsigned short>(static_cast<size_t>(local_n0) *
+                                             static_cast<size_t>(dims[1]) *
+                                             static_cast<size_t>(dims[2]));
+    std::memcpy(ms, std::data(buffer) + offset, block_size);
+    offset += block_size;
+
+    return offset;
+}
+
 void Reader::OpenResultsFile(const char *output_fn)
 {
     std::snprintf(results_filename, sizeof(results_filename), "%s", output_fn);