From 44bd80cecbc42d2e24dbdd2affabb2165960bba8 Mon Sep 17 00:00:00 2001
From: Eitan Porat <eitan.porat@weizmann.ac.il>
Date: Tue, 21 Oct 2025 18:50:32 +0000
Subject: [PATCH 1/5] Add Nonogram puzzle environment

- Add Nonogram C implementation (nonogram.c, nonogram.h, binding.c)
- Add Python wrapper (nonogram.py)
- Add Tetris-style neural network architecture for Nonogram in torch.py
  - Multi-layer CNN for grid with spatial downsampling
  - Separate encoders for row clues, column clues, and board size
  - No weight sharing between row and column encoders
- Add configuration file (nonogram.ini)
- Register NonogramLSTM and Nonogram policy classes
---
 pufferlib/config/ocean/nonogram.ini  | 152 ++++++
 pufferlib/ocean/nonogram/binding.c   |  75 +++
 pufferlib/ocean/nonogram/nonogram.c  |  32 ++
 pufferlib/ocean/nonogram/nonogram.h  | 775 +++++++++++++++++++++++++++
 pufferlib/ocean/nonogram/nonogram.py |  85 +++
 pufferlib/ocean/torch.py             |  91 ++++
 6 files changed, 1210 insertions(+)
 create mode 100644 pufferlib/config/ocean/nonogram.ini
 create mode 100644 pufferlib/ocean/nonogram/binding.c
 create mode 100644 pufferlib/ocean/nonogram/nonogram.c
 create mode 100644 pufferlib/ocean/nonogram/nonogram.h
 create mode 100644 pufferlib/ocean/nonogram/nonogram.py

diff --git a/pufferlib/config/ocean/nonogram.ini b/pufferlib/config/ocean/nonogram.ini
new file mode 100644
index 000000000..0c5d6ff44
--- /dev/null
+++ b/pufferlib/config/ocean/nonogram.ini
@@ -0,0 +1,152 @@
+[base]
+package = ocean
+env_name = puffer_nonogram
+policy_name = Nonogram
+; policy_name = Policy
+rnn_name = Recurrent
+
+[env]
+num_envs = 4096
+min_size = 4
+max_size = 8
+easy_learn = 1
+
+[sweep]
+metric = score
+
+[train]
+; Hyperparameters from wandb config
+name = pufferai
+seed = 42
+gamma = 0.99965
+device = cuda
+compile = False
+project = ablations
+use_rnn = True
+vf_coef = 2.365
+adam_eps = 1.566e-10
+data_dir = experiments
+ent_coef = 0.01554
+anneal_lr = True
+clip_coef = 0.1267
+optimizer = muon
+precision = float32
+adam_beta1 = 0.7912
+adam_beta2 = 0.999949
+batch_size = auto
+gae_lambda = 0.9007
+prio_alpha = 0.7441
+prio_beta0 = 0.7365
+cpu_offload = False
+bptt_horizon = 64
+compile_mode = max-autotune-no-cudagraphs
+vf_clip_coef = 1.598
+learning_rate = 0.007103
+max_grad_norm = 1.275
+update_epochs = 1
+vtrace_c_clip = 0.8692
+minibatch_size = 32768
+total_timesteps = 2e10
+vtrace_rho_clip = 0.9074
+compile_fullgraph = True
+max_minibatch_size = 32768
+checkpoint_interval = 200
+torch_deterministic = True
+
+; PREVIOUS RUN: Run ID 3k4cpz3ts3keprggvyldpo0bzpo2djsc
+; total_timesteps = 1e10
+; minibatch_size = 65536
+; use_rnn = True
+; update_epochs = 1
+; bptt_horizon = 64
+; gae_lambda = 0.9860112307817481
+; gamma = 0.9955237802885055
+; clip_coef = 0.3339182687952462
+; vf_coef = 1.3604733057894562
+; vf_clip_coef = 0.1
+; ent_coef = 0.01267345559258322
+; max_grad_norm = 0.7481994494317118
+; learning_rate = 0.0071601604548789605
+; adam_eps = 2.1466958248623007e-10
+; adam_beta1 = 0.9600776540257598
+; adam_beta2 = 0.9987918405974582
+; anneal_lr = True
+; optimizer = muon
+; prio_alpha = 0.9248668653880601
+; prio_beta0 = 0.9583638692801064
+; vtrace_c_clip = 2.931704492528996
+; vtrace_rho_clip = 1.2830763533710652
+
+; PREVIOUS RUN: puffer_sweep_simple_policy
+; total_timesteps = 1e10
+; minibatch_size = 16384
+; use_rnn = True
+; update_epochs = 1
+; bptt_horizon = 64
+; gae_lambda = 0.9698
+; gamma = 0.9979
+; clip_coef = 0.1896
+; vf_coef = 1.4565
+; vf_clip_coef = 0.2296
+; ent_coef = 0.01257
+; max_grad_norm = 0.4804
+; learning_rate = 0.06449
+; adam_eps = 4.577e-10
+; adam_beta1 = 0.8184
+; adam_beta2 = 0.9996
+; anneal_lr = True
+; optimizer = muon
+; prio_alpha = 0.8445
+; prio_beta0 = 0.9498
+; vtrace_c_clip = 3.5953
+; vtrace_rho_clip = 2.2273
+
+; PREVIOUS RUN: puffer_sweep_black_white_actions
+; total_timesteps = 1e10
+; minibatch_size = 32768
+; use_rnn = True
+; update_epochs = 1
+; bptt_horizon = 64
+; gae_lambda = 0.8645
+; gamma = 0.9991
+; clip_coef = 0.3043
+; vf_coef = 2.1905
+; vf_clip_coef = 3.1475
+; ent_coef = 0.002274
+; max_grad_norm = 1.0202
+; learning_rate = 0.007169
+; adam_eps = 6.036e-11
+; adam_beta1 = 0.9366
+; adam_beta2 = 0.9985
+; anneal_lr = True
+; optimizer = muon
+; prio_alpha = 0.8741
+; prio_beta0 = 0.7869
+; vtrace_c_clip = 1.6859
+; vtrace_rho_clip = 1.5254
+; total_timesteps = 10_000_000_000
+; minibatch_size = 32768
+; use_rnn = True
+; update_epochs = 1
+; bptt_horizon = 64
+; gae_lambda = 0.6
+; gamma = 0.9999
+; clip_coef = 0.01
+; vf_coef = 4.453
+; vf_clip_coef = 0.1
+; ent_coef = 0.001160
+; max_grad_norm = 1.071
+; learning_rate = 0.003555
+; adam_eps = 1.675e-14
+; adam_beta1 = 0.9817
+; adam_beta2 = 0.9052
+; anneal_lr = True
+; optimizer = muon
+; compile = False
+; precision = float32
+; torch_deterministic = True
+; checkpoint_interval = 200
+; prio_alpha = 0.99
+; prio_beta0 = 0.855
+; vtrace_c_clip = 0.7794
+; vtrace_rho_clip = 0.8655
diff --git a/pufferlib/ocean/nonogram/binding.c b/pufferlib/ocean/nonogram/binding.c
new file mode 100644
index 000000000..8f63c9d9b
--- /dev/null
+++ b/pufferlib/ocean/nonogram/binding.c
@@ -0,0 +1,75 @@
+#include <Python.h>
+#include "nonogram.h"
+
+// Forward-declare the custom methods: MY_METHODS below names them before their definitions appear
+static PyObject* vec_get_solutions(PyObject* self, PyObject* args);
+static PyObject* vec_get_size(PyObject* self, PyObject* args);
+
+#define Env Nonogram
+#define MY_METHODS \
+    {"vec_get_solutions", vec_get_solutions, METH_VARARGS, "Get solutions from all environments"}, \
+    {"vec_get_size", vec_get_size, METH_VARARGS, "Get current board size"}
+
+#include "../env_binding.h"
+
+static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
+    // Pull the board-size range and the easy-learn flag out of the Python kwargs.
+    env->min_size = unpack(kwargs, "min_size");
+    env->size = env->max_size = unpack(kwargs, "max_size");
+    env->easy_learn = unpack(kwargs, "easy_learn");
+    env->max_steps = 4 * env->size * env->size;  // size == max_size here
+    return 0;
+}
+
+static int my_log(PyObject* dict, Log* log) {  // hands each Log field to assign_to_dict under its own name
+    assign_to_dict(dict, "score", log->score);
+    assign_to_dict(dict, "episode_return", log->episode_return);
+    assign_to_dict(dict, "episode_length", log->episode_length);
+    assign_to_dict(dict, "solved", log->solved);
+    return 0;  // Log.n is not exported here
+}
+
+// Custom method: copy every environment's solution grid into a caller-supplied NumPy byte buffer
+static PyObject* vec_get_solutions(PyObject* self, PyObject* args) {
+    if (PyTuple_Size(args) != 2) {
+        PyErr_SetString(PyExc_TypeError, "vec_get_solutions requires 2 arguments");
+        return NULL;
+    }
+
+    VecEnv* vec = unpack_vecenv(args);
+    if (!vec) {
+        return NULL;
+    }
+
+    PyObject* solutions_obj = PyTuple_GetItem(args, 1);
+    if (!PyObject_TypeCheck(solutions_obj, &PyArray_Type)) {
+        PyErr_SetString(PyExc_TypeError, "solutions must be a NumPy array");
+        return NULL;
+    }
+    PyArrayObject* solutions = (PyArrayObject*)solutions_obj;
+    int max_grid_size = MAX_SIZE * MAX_SIZE;
+    // Reject buffers the memcpy loop below would overrun, not only non-contiguous ones.
+    if (!PyArray_ISCONTIGUOUS(solutions) || PyArray_NBYTES(solutions) < (npy_intp)vec->num_envs * max_grid_size) {
+        PyErr_SetString(PyExc_ValueError, "solutions must be contiguous and hold num_envs * MAX_SIZE * MAX_SIZE bytes");
+        return NULL;
+    }
+    // Each env contributes a full MAX_SIZE x MAX_SIZE block regardless of its current board size
+    unsigned char* sol_ptr = PyArray_DATA(solutions);
+    for (int i = 0; i < vec->num_envs; i++) {
+        Nonogram* env = vec->envs[i];
+        memcpy(sol_ptr + i * max_grid_size, env->solution, max_grid_size);
+    }
+
+    Py_RETURN_NONE;
+}
+
+// Report the current board size of the first environment only
+static PyObject* vec_get_size(PyObject* self, PyObject* args) {
+    VecEnv* vec = unpack_vecenv(args);
+    if (vec == NULL) {
+        return NULL;
+    }
+    // c_reset() draws a size per environment, so the others may differ from envs[0]
+    const Nonogram* first = vec->envs[0];
+    return PyLong_FromLong(first->size);
+}
diff --git a/pufferlib/ocean/nonogram/nonogram.c b/pufferlib/ocean/nonogram/nonogram.c
new file mode 100644
index 000000000..953be3dd1
--- /dev/null
+++ b/pufferlib/ocean/nonogram/nonogram.c
@@ -0,0 +1,32 @@
+/* Pure C demo file for Nonogram. Build it with:
+ * bash scripts/build_ocean.sh nonogram local (debug)
+ * bash scripts/build_ocean.sh nonogram fast
+ */
+
+#include "nonogram.h"
+
+int main() {
+    // c_reset() draws size from [min_size, max_size], so both must be set; zeroed bounds do not give 8x8.
+    Nonogram env = {.size = MAX_SIZE, .min_size = MAX_SIZE, .max_size = MAX_SIZE};
+    // Layout written by c_reset(): padded grid, row clues, column clues, then one board-size byte.
+    int obs_size = MAX_SIZE * MAX_SIZE + 2 * MAX_SIZE * MAX_CLUES + 1;
+
+    env.observations = (unsigned char*)calloc(obs_size, sizeof(unsigned char));
+    env.actions = (int*)calloc(1, sizeof(int));
+    env.rewards = (float*)calloc(1, sizeof(float));
+    env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char));
+
+    c_reset(&env);  // also derives max_steps from the drawn size
+    c_render(&env);
+    while (!WindowShouldClose()) {
+        env.actions[0] = rand() % (2 * MAX_SIZE * MAX_SIZE);  // 0-63 mark WHITE, 64-127 mark BLACK
+        c_step(&env);
+        c_render(&env);
+    }
+
+    free(env.observations);
+    free(env.actions);
+    free(env.rewards);
+    free(env.terminals);
+    c_close(&env);
+}
diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h
new file mode 100644
index 000000000..b4060a1de
--- /dev/null
+++ b/pufferlib/ocean/nonogram/nonogram.h
@@ -0,0 +1,775 @@
+/* Nonogram: A logic puzzle environment
+ * Players fill cells based on row and column clues (run-length encoding)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "raylib.h"
+
+// Debug mode: set to 1 to enable debug output, 0 to disable
+#define DEBUG 0
+
+#if DEBUG
+#define debug_printf(...) printf(__VA_ARGS__)
+#else
+#define debug_printf(...) ((void)0)
+#endif
+
+#define MAX_SIZE 8  // compile-time board-side limit; also the row stride of every grid buffer
+#define MAX_CLUES (MAX_SIZE / 2)  // clue slots stored per row/column
+
+const unsigned char CELL_EMPTY = 0;  // valid cell not yet marked
+const unsigned char CELL_WHITE = 1;  // marked as not filled
+const unsigned char CELL_BLACK = 2;  // marked as filled
+const unsigned char CELL_PADDING = 3;  // grid slot outside the active size x size board
+
+const float REWARD_WIN = 1.0;  // filled_total reached target_total
+const float REWARD_INVALID_MOVE = -0.2;  // re-marking a cell, filling an already-full line, or an over-long run
+const float REWARD_OUT_OF_BOUNDS = -0.2;  // action addresses a padding cell
+const float REWARD_TIMEOUT = -0.1;  // steps_taken exceeded max_steps
+const float REWARD_COMPLETE_LINE = 0.02;  // per newly completed row or column
+const float REWARD_EASY_LEARN_CORRECT = 0.01;  // easy_learn: marked cell equals the solution
+const float REWARD_EASY_LEARN_INCORRECT = -0.01;  // easy_learn: marked cell differs (episode ends)
+const float REWARD_NO_MATCH = -0.05;  // line reached its target count but its runs differ from the clues
+
+// Required struct for logging: running sums that add_log() updates each time an episode ends
+typedef struct {
+    float score;  // sum of the final-step reward (rewards[0]) of each logged episode
+    float episode_return;  // sum of each logged episode's accumulated episode_reward
+    float episode_length;  // sum of each logged episode's steps_taken
+    float solved;  // number of logged episodes that add_log() counted as solved
+    float n;  // number of episodes logged
+} Log;
+
+// Nonogram environment struct
+typedef struct {
+    Log log;  // running episode statistics, updated by add_log()
+    unsigned char* observations;  // MAX_SIZE x MAX_SIZE grid, then row clues, column clues, board-size byte
+    int* actions;  // actions[0] is the move applied by c_step()
+    float* rewards;  // rewards[0] is written by c_step()
+    unsigned char* terminals;  // terminals[0] is set to 1 on the step that ends an episode
+
+    // Environment state
+    int size;  // side length of the current puzzle, drawn in c_reset()
+    int min_size;  // inclusive lower bound for size
+    int max_size;  // inclusive upper bound for size
+    int max_steps;  // step budget; exceeding it ends the episode with REWARD_TIMEOUT
+    int steps_taken;  // c_step() calls so far in this episode
+    int filled_total;  // BLACK cells placed so far in this episode
+    int target_total;  // sum of rows_target_sum, i.e. BLACK cells in the solution
+    int easy_learn;  // nonzero: every mark is checked against the solution immediately
+
+    // Solution (for generating clues)
+    unsigned char solution[MAX_SIZE * MAX_SIZE];  // CELL_WHITE/CELL_BLACK, row stride MAX_SIZE
+
+    // Clues
+    unsigned char rows_clues[MAX_SIZE * MAX_CLUES];  // run lengths per row, MAX_CLUES slots each, zero-padded
+    unsigned char cols_clues[MAX_SIZE * MAX_CLUES];  // run lengths per column, same layout
+    unsigned char rows_num_runs[MAX_SIZE];  // number of runs in each row's clue
+    unsigned char cols_num_runs[MAX_SIZE];  // number of runs in each column's clue
+    unsigned char rows_target_sum[MAX_SIZE];  // sum of each row's clues
+    unsigned char cols_target_sum[MAX_SIZE];  // sum of each column's clues
+    unsigned char rows_max_clue[MAX_SIZE];  // largest clue in each row
+    unsigned char cols_max_clue[MAX_SIZE];  // largest clue in each column
+
+    // Current totals
+    unsigned char rows_totals[MAX_SIZE];  // BLACK cells placed so far in each row
+    unsigned char cols_totals[MAX_SIZE];  // BLACK cells placed so far in each column
+
+    // Track completed lines
+    unsigned char rows_completed[MAX_SIZE];  // 1 once the row's completion bonus has been granted
+    unsigned char cols_completed[MAX_SIZE];  // 1 once the column's completion bonus has been granted
+
+    // Episode reward accumulator
+    float episode_reward;  // reset to 0 in c_reset(), added to by c_step()
+} Nonogram;
+
+// Helper function implementations
+void add_log(Nonogram* env) {  // fold the episode that just ended into the running Log sums
+    env->log.score += env->rewards[0];
+    env->log.episode_length += env->steps_taken;
+    env->log.episode_return += env->episode_reward;
+    env->log.solved += (env->rewards[0] >= REWARD_WIN) ? 1 : 0;  // line bonuses can leave a failed last step > 0
+    env->log.n++;
+}
+
+// Length of the horizontal BLACK run that (row, col) would belong to if it were BLACK:
+// the cell itself plus the contiguous BLACK neighbours on either side in the same row.
+// The cell at (row, col) itself is never read.
+int get_row_run_length(Nonogram* env, int row, int col) {
+    int row_start = row * MAX_SIZE;
+    const unsigned char* cells = env->observations + row_start;
+
+    debug_printf("  get_row_run_length: row=%d, col=%d, row_start=%d\n", row, col, row_start);
+    debug_printf("  Row cells before marking: ");
+    for (int k = 0; k < env->size; k++) {
+        debug_printf("%d ", cells[k]);
+    }
+    debug_printf("\n");
+
+    // Walk left from col's neighbour until a non-BLACK cell or the board edge
+    int left_count = 0;
+    int scan = col - 1;
+    while (scan >= 0 && cells[scan] == CELL_BLACK) {
+        left_count++;
+        scan--;
+    }
+    debug_printf("  Left count: %d\n", left_count);
+
+    // Walk right the same way, bounded by the active board size
+    int right_count = 0;
+    scan = col + 1;
+    while (scan < env->size && cells[scan] == CELL_BLACK) {
+        right_count++;
+        scan++;
+    }
+    debug_printf("  Right count: %d\n", right_count);
+
+    // The candidate cell counts as 1
+    int run_length = 1 + left_count + right_count;
+    debug_printf("  Total run_length (1 + left + right): %d\n", run_length);
+
+    return run_length;
+}
+
+// Length of the vertical BLACK run that (row, col) would belong to if it were BLACK:
+// the cell itself plus the contiguous BLACK neighbours above and below in the same column.
+// Rows are MAX_SIZE bytes apart in observations; the cell at (row, col) itself is never read.
+int get_col_run_length(Nonogram* env, int row, int col) {
+    const unsigned char* grid = env->observations;
+
+    debug_printf("  get_col_run_length: row=%d, col=%d\n", row, col);
+    debug_printf("  Col cells before marking: ");
+    for (int k = 0; k < env->size; k++) {
+        debug_printf("%d ", grid[k * MAX_SIZE + col]);
+    }
+    debug_printf("\n");
+
+    // Walk up from row's neighbour until a non-BLACK cell or the board edge
+    int up_count = 0;
+    int scan = row - 1;
+    while (scan >= 0 && grid[scan * MAX_SIZE + col] == CELL_BLACK) {
+        up_count++;
+        scan--;
+    }
+    debug_printf("  Up count: %d\n", up_count);
+
+    // Walk down the same way, bounded by the active board size
+    int down_count = 0;
+    scan = row + 1;
+    while (scan < env->size && grid[scan * MAX_SIZE + col] == CELL_BLACK) {
+        down_count++;
+        scan++;
+    }
+    debug_printf("  Down count: %d\n", down_count);
+
+    // The candidate cell counts as 1
+    int run_length = 1 + up_count + down_count;
+    debug_printf("  Total run_length (1 + up + down): %d\n", run_length);
+
+    return run_length;
+}
+
+// Return 1 if the BLACK runs in line_data[0..size) equal clues[0..num_runs) in order, else 0.
+int check_line_matches(unsigned char* line_data, unsigned char* clues, int num_runs, int size) {
+    debug_printf("  check_line_matches: num_runs=%d, size=%d\n", num_runs, size);
+    debug_printf("  Line data: ");
+    for (int i = 0; i < size; i++) {
+        debug_printf("%d ", line_data[i]);
+    }
+    debug_printf("\n");
+    debug_printf("  Expected clues: ");
+    for (int i = 0; i < num_runs; i++) {
+        debug_printf("%d ", clues[i]);
+    }
+    debug_printf("\n");
+
+    int run_idx = 0;
+    int count = 0;
+    for (int i = 0; i < size; i++) {
+        if (line_data[i] == CELL_BLACK) {
+            count++;
+            debug_printf("  Position %d: BLACK, count=%d\n", i, count);
+        } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) {
+            if (count > 0) {
+                int expected = run_idx < num_runs ? clues[run_idx] : -1;  // -1: more runs than clues, never read past them
+                debug_printf("  End of run at position %d: count=%d, expected=%d (run_idx=%d)\n", i, count, expected, run_idx);
+                if (expected != count) {
+                    debug_printf("  MISMATCH! Expected %d but got %d\n", expected, count);
+                    return 0;
+                }
+                run_idx++;
+                count = 0;
+            }
+        }
+    }
+    // Check final run (a run that touches the end of the line is not closed inside the loop)
+    if (count > 0) {
+        int expected = run_idx < num_runs ? clues[run_idx] : -1;  // same guard as above
+        debug_printf("  Final run: count=%d, expected=%d (run_idx=%d)\n", count, expected, run_idx);
+        if (expected != count) {
+            debug_printf("  FINAL MISMATCH! Expected %d but got %d\n", expected, count);
+            return 0;
+        }
+        run_idx++;
+    }
+
+    debug_printf("  Total runs found: %d, expected: %d\n", run_idx, num_runs);
+    int matches = (run_idx == num_runs);
+    debug_printf("  Pattern matches: %d\n", matches);
+    return matches;
+}
+
+// Scale one rand() draw by RAND_MAX to get a float in the closed interval [0, 1]
+float rand_uniform() {
+    return ((float)rand()) / ((float)RAND_MAX);
+}
+
+// Required functions
+void c_reset(Nonogram* env) {
+    // Start a new episode: pick a board size, generate a random solution, derive its clues,
+    // write the blank grid, clues and size into observations, and zero the per-episode state.
+    // The size range is clamped to [1, MAX_SIZE]: it guards the modulos below and the fixed buffers.
+    int lo = env->min_size < 1 ? 1 : (env->min_size > MAX_SIZE ? MAX_SIZE : env->min_size);
+    int hi = env->max_size < lo ? lo : (env->max_size > MAX_SIZE ? MAX_SIZE : env->max_size);
+    env->size = lo + (rand() % (hi - lo + 1));
+    env->max_steps = env->easy_learn ? env->size * env->size : 4 * env->size * env->size;
+
+    int full_grid_size = MAX_SIZE * MAX_SIZE;
+    int max_clues = MAX_SIZE / 2;
+
+    // Initialize all grid as PADDING, then clear valid cells to EMPTY (using MAX_SIZE stride)
+    memset(env->observations, CELL_PADDING, full_grid_size);
+    for (int r = 0; r < env->size; r++) {
+        for (int c = 0; c < env->size; c++) {
+            env->observations[r * MAX_SIZE + c] = CELL_EMPTY;
+        }
+    }
+    // Clear clue areas
+    memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues);
+
+    // Random solution (MAX_SIZE stride); fill probability is itself drawn from [0, 1] to vary density
+    float fill_prob = rand_uniform();
+    memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE);
+    int has_filled = 0;
+    for (int i = 0; i < env->size; i++) {
+        for (int j = 0; j < env->size; j++) {
+            if (rand_uniform() < fill_prob) {
+                env->solution[i * MAX_SIZE + j] = CELL_BLACK;
+                has_filled = 1;
+            }
+        }
+    }
+
+    // Ensure at least one square is set
+    if (!has_filled) {
+        int rand_row = rand() % env->size;
+        int rand_col = rand() % env->size;
+        env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK;
+    }
+
+    // Reset clues arrays
+    memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES);
+    memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES);
+
+    // Calculate row clues
+    for (int i = 0; i < env->size; i++) {
+        int clue_idx = 0;
+        int count = 0;
+        for (int j = 0; j < env->size; j++) {
+            if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) {
+                count++;
+            } else if (count > 0) {
+                env->rows_clues[i * MAX_CLUES + clue_idx] = count;
+                clue_idx++;
+                count = 0;
+            }
+        }
+        if (count > 0) {
+            env->rows_clues[i * MAX_CLUES + clue_idx] = count;
+            clue_idx++;
+        }
+        env->rows_num_runs[i] = clue_idx;
+    }
+
+    // Calculate column clues
+    for (int j = 0; j < env->size; j++) {
+        int clue_idx = 0;
+        int count = 0;
+        for (int i = 0; i < env->size; i++) {
+            if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) {
+                count++;
+            } else if (count > 0) {
+                env->cols_clues[j * MAX_CLUES + clue_idx] = count;
+                clue_idx++;
+                count = 0;
+            }
+        }
+        if (count > 0) {
+            env->cols_clues[j * MAX_CLUES + clue_idx] = count;
+            clue_idx++;
+        }
+        env->cols_num_runs[j] = clue_idx;
+    }
+
+    // Store clues in observation
+    memcpy(env->observations + full_grid_size, env->rows_clues, MAX_SIZE * max_clues);
+    memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues, env->cols_clues, MAX_SIZE * max_clues);
+    // Store board size as scalar at end of observation
+    env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size;
+
+    // Calculate max clues and target sums
+    memset(env->rows_totals, 0, MAX_SIZE);
+    memset(env->cols_totals, 0, MAX_SIZE);
+    memset(env->rows_completed, 0, MAX_SIZE);
+    memset(env->cols_completed, 0, MAX_SIZE);
+    env->filled_total = 0;
+
+    for (int i = 0; i < env->size; i++) {
+        // Find max clue for row
+        int max_clue = 0;
+        int sum = 0;
+        for (int j = 0; j < max_clues; j++) {
+            int clue = env->rows_clues[i * MAX_CLUES + j];
+            if (clue > max_clue) {
+                max_clue = clue;
+            }
+            sum += clue;
+        }
+        env->rows_max_clue[i] = max_clue;
+        env->rows_target_sum[i] = sum;
+
+        // Find max clue for col
+        max_clue = 0;
+        sum = 0;
+        for (int j = 0; j < max_clues; j++) {
+            int clue = env->cols_clues[i * MAX_CLUES + j];
+            if (clue > max_clue) {
+                max_clue = clue;
+            }
+            sum += clue;
+        }
+        env->cols_max_clue[i] = max_clue;
+        env->cols_target_sum[i] = sum;
+    }
+
+    // Calculate target total
+    env->target_total = 0;
+    for (int i = 0; i < env->size; i++) {
+        env->target_total += env->rows_target_sum[i];
+    }
+
+    // Debug: print solution and clues
+    debug_printf("\n=== RESET: New puzzle generated (size=%d) ===\n", env->size);
+    debug_printf("Solution grid:\n");
+    for (int r = 0; r < env->size; r++) {
+        debug_printf("  Row %d: ", r);
+        for (int c = 0; c < env->size; c++) {
+            debug_printf("%d ", env->solution[r * MAX_SIZE + c]);
+        }
+        debug_printf("\n");
+    }
+    debug_printf("\nRow clues:\n");
+    for (int r = 0; r < env->size; r++) {
+        debug_printf("  Row %d (num_runs=%d, target_sum=%d, max_clue=%d): ",
+               r, env->rows_num_runs[r], env->rows_target_sum[r], env->rows_max_clue[r]);
+        for (int i = 0; i < MAX_CLUES; i++) {
+            int clue = env->rows_clues[r * MAX_CLUES + i];
+            if (clue > 0) {
+                debug_printf("%d ", clue);
+            }
+        }
+        debug_printf("\n");
+    }
+    debug_printf("\nColumn clues:\n");
+    for (int c = 0; c < env->size; c++) {
+        debug_printf("  Col %d (num_runs=%d, target_sum=%d, max_clue=%d): ",
+               c, env->cols_num_runs[c], env->cols_target_sum[c], env->cols_max_clue[c]);
+        for (int i = 0; i < MAX_CLUES; i++) {
+            int clue = env->cols_clues[c * MAX_CLUES + i];
+            if (clue > 0) {
+                debug_printf("%d ", clue);
+            }
+        }
+        debug_printf("\n");
+    }
+    debug_printf("\nTarget total BLACK cells: %d\n", env->target_total);
+    debug_printf("===================================\n\n");
+
+    env->steps_taken = 0;
+    env->episode_reward = 0;
+}
+
+void c_step(Nonogram* env) {
+    // Apply actions[0] to the board and write rewards[0] / terminals[0].
+    // Every terminal outcome logs the episode and immediately calls c_reset(), so the
+    // observation left behind after a terminal step already belongs to the next puzzle.
+    int action = env->actions[0];
+    env->terminals[0] = 0;
+    env->rewards[0] = 0;
+    env->steps_taken++;
+    debug_printf("DEBUG c_step: action=%d, steps=%d\n", action, env->steps_taken);
+
+    // Check timeout FIRST before any game logic
+    if (env->steps_taken > env->max_steps) {
+        debug_printf("DEBUG: TIMEOUT\n");
+        env->terminals[0] = 1;
+        env->rewards[0] = REWARD_TIMEOUT;
+        env->episode_reward += REWARD_TIMEOUT;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+
+    // Decode action: 0-63 = mark WHITE, 64-127 = mark BLACK
+    int mark_black = action >= (MAX_SIZE * MAX_SIZE);
+    int pos = action % (MAX_SIZE * MAX_SIZE);
+
+    debug_printf("DEBUG: mark_black=%d, pos=%d\n", mark_black, pos);
+
+    // Convert position to row/col using MAX_SIZE stride
+    int row = pos / MAX_SIZE;
+    int col = pos % MAX_SIZE;
+
+    debug_printf("DEBUG: row=%d, col=%d, size=%d\n", row, col, env->size);
+
+    // Out of bounds: padding area, or a negative action (which would index before the grid)
+    if (action < 0 || row >= env->size || col >= env->size) {
+        debug_printf("DEBUG: OUT OF BOUNDS (row=%d, col=%d >= size=%d)\n", row, col, env->size);
+        env->terminals[0] = 1;
+        env->rewards[0] = REWARD_OUT_OF_BOUNDS;
+        env->episode_reward += REWARD_OUT_OF_BOUNDS;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+
+    unsigned char current = env->observations[pos];
+
+    debug_printf("DEBUG: current cell value=%d (EMPTY=%d, WHITE=%d, BLACK=%d, PADDING=%d)\n",
+           current, CELL_EMPTY, CELL_WHITE, CELL_BLACK, CELL_PADDING);
+
+    // Can't mark a cell that's already been marked
+    if (current != CELL_EMPTY) {
+        debug_printf("DEBUG: INVALID - cell already marked (current=%d)\n", current);
+        env->terminals[0] = 1;
+        env->rewards[0] = REWARD_INVALID_MOVE;
+        env->episode_reward += REWARD_INVALID_MOVE;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+
+    // Mark cell as BLACK or WHITE
+    if (mark_black) {
+        debug_printf("DEBUG: Marking BLACK\n");
+        // Marking BLACK - check if valid
+        // First check: totals equal target - invalid move (terminate episode)
+        debug_printf("DEBUG: rows_totals[%d]=%d, rows_target_sum[%d]=%d\n",
+               row, env->rows_totals[row], row, env->rows_target_sum[row]);
+        debug_printf("DEBUG: cols_totals[%d]=%d, cols_target_sum[%d]=%d\n",
+               col, env->cols_totals[col], col, env->cols_target_sum[col]);
+
+        if (env->rows_totals[row] == env->rows_target_sum[row] ||
+            env->cols_totals[col] == env->cols_target_sum[col]) {
+            debug_printf("DEBUG: INVALID - row or col already full\n");
+            env->terminals[0] = 1;
+            env->rewards[0] = REWARD_INVALID_MOVE;
+            env->episode_reward += REWARD_INVALID_MOVE;
+            add_log(env);
+            c_reset(env);
+            return;
+        }
+
+        // Check if marking this cell BLACK would create a run longer than max allowed
+        int row_run = get_row_run_length(env, row, col);
+        debug_printf("DEBUG: row_run_length=%d, rows_max_clue[%d]=%d\n",
+               row_run, row, env->rows_max_clue[row]);
+
+        if (row_run > env->rows_max_clue[row]) {
+            debug_printf("DEBUG: INVALID - row run too long\n");
+            env->terminals[0] = 1;
+            env->rewards[0] = REWARD_INVALID_MOVE;
+            env->episode_reward += REWARD_INVALID_MOVE;
+            add_log(env);
+            c_reset(env);
+            return;
+        }
+
+        int col_run = get_col_run_length(env, row, col);
+        debug_printf("DEBUG: col_run_length=%d, cols_max_clue[%d]=%d\n",
+               col_run, col, env->cols_max_clue[col]);
+
+        if (col_run > env->cols_max_clue[col]) {
+            debug_printf("DEBUG: INVALID - col run too long\n");
+            env->terminals[0] = 1;
+            env->rewards[0] = REWARD_INVALID_MOVE;
+            env->episode_reward += REWARD_INVALID_MOVE;
+            add_log(env);
+            c_reset(env);
+            return;
+        }
+
+        // Second check: if completing row/col, check runs match
+        int row_completed = 0;
+        int col_completed = 0;
+
+        debug_printf("DEBUG: Checking line completion...\n");
+
+        if (env->rows_totals[row] == env->rows_target_sum[row] - 1) {
+            debug_printf("DEBUG: Would complete row %d, checking pattern...\n", row);
+            // Temporarily mark BLACK to check
+            env->observations[pos] = CELL_BLACK;
+            int row_start = row * MAX_SIZE;
+            int matches = check_line_matches(env->observations + row_start,
+                                   env->rows_clues + row * MAX_CLUES,
+                                   env->rows_num_runs[row], env->size);
+            debug_printf("DEBUG: Row pattern matches: %d\n", matches);
+            if (!matches) {
+                // Runs don't match - invalid move (terminate episode)
+                debug_printf("DEBUG: INVALID - row pattern doesn't match\n");
+                env->observations[pos] = CELL_EMPTY;
+                env->terminals[0] = 1;
+                env->rewards[0] = REWARD_NO_MATCH;
+                env->episode_reward += REWARD_NO_MATCH;
+                add_log(env);
+                c_reset(env);
+                return;
+            }
+            env->observations[pos] = CELL_EMPTY;
+            row_completed = 1;
+        }
+
+        if (env->cols_totals[col] == env->cols_target_sum[col] - 1) {
+            debug_printf("DEBUG: Would complete col %d, checking pattern...\n", col);
+            // Temporarily mark BLACK to check
+            env->observations[pos] = CELL_BLACK;
+            unsigned char col_data[MAX_SIZE];
+            for (int i = 0; i < env->size; i++) {
+                col_data[i] = env->observations[i * MAX_SIZE + col];
+            }
+            int matches = check_line_matches(col_data,
+                                   env->cols_clues + col * MAX_CLUES,
+                                   env->cols_num_runs[col], env->size);
+            debug_printf("DEBUG: Col pattern matches: %d\n", matches);
+            if (!matches) {
+                // Runs don't match - invalid move (terminate episode)
+                debug_printf("DEBUG: INVALID - col pattern doesn't match\n");
+                env->observations[pos] = CELL_EMPTY;
+                env->terminals[0] = 1;
+                env->rewards[0] = REWARD_NO_MATCH;
+                env->episode_reward += REWARD_NO_MATCH;
+                add_log(env);
+                c_reset(env);
+                return;
+            }
+            env->observations[pos] = CELL_EMPTY;
+            col_completed = 1;
+        }
+
+        // Apply mark BLACK
+        env->observations[pos] = CELL_BLACK;
+        env->rows_totals[row]++;
+        env->cols_totals[col]++;
+        env->filled_total++;
+
+        // Give reward for newly completed lines only
+        int row_newly_completed = row_completed && !env->rows_completed[row];
+        int col_newly_completed = col_completed && !env->cols_completed[col];
+
+        if (row_newly_completed) env->rows_completed[row] = 1;
+        if (col_newly_completed) env->cols_completed[col] = 1;
+
+        float line_reward = (row_newly_completed + col_newly_completed) * REWARD_COMPLETE_LINE;
+        env->rewards[0] += line_reward;
+        env->episode_reward += line_reward;
+    } else {
+        // Marking WHITE - always valid (just marks empty as not-black)
+        env->observations[pos] = CELL_WHITE;
+    }
+
+    // Easy learn mode: check if cell matches solution
+    if (env->easy_learn) {
+        unsigned char solution_cell = env->solution[pos];
+        unsigned char actual = env->observations[pos];
+
+        if (solution_cell == actual) {
+            // Correct move: give positive reward and continue
+            env->rewards[0] += REWARD_EASY_LEARN_CORRECT;
+            env->episode_reward += REWARD_EASY_LEARN_CORRECT;
+        } else {
+            // Incorrect move: give negative reward, terminate and reset
+            env->rewards[0] += REWARD_EASY_LEARN_INCORRECT;
+            env->episode_reward += REWARD_EASY_LEARN_INCORRECT;
+            env->terminals[0] = 1;
+            add_log(env);
+            c_reset(env);
+            return;
+        }
+    }
+
+    // Check if solved (filled_total == target_total means all BLACK cells placed correctly)
+    if (env->filled_total == env->target_total) {
+        env->terminals[0] = 1;
+        env->rewards[0] = REWARD_WIN;
+        env->episode_reward += REWARD_WIN;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+}
+
+void c_render(Nonogram* env) {  // draws the current board (left) and the solution (right)
+    if (!IsWindowReady()) {  // first call: open a window sized for the largest (MAX_SIZE) board
+        int board_width = 120 + MAX_SIZE * 40;  // same 120 / 40 values as clue_area / cell_size below
+        int board_height = 120 + MAX_SIZE * 40;
+        int screen_width = board_width * 2 + 60 + 40;  // two boards side by side plus spacing
+        int screen_height = board_height + 140;
+        InitWindow(screen_width, screen_height, "Nonogram (C)");
+        SetTargetFPS(60);
+    }
+
+    if (IsKeyDown(KEY_ESCAPE)) {  // ESC terminates the whole process
+        exit(0);
+    }
+
+    BeginDrawing();
+    ClearBackground((Color){0, 0, 0, 255});
+
+    int cell_size = 40;  // side length of one grid cell
+    int clue_area = 120;  // strip reserved for clue text above and to the left of each grid
+    int board_spacing = 60;  // horizontal gap between the two boards
+    int font_size = 20;
+
+    // Titles; the solution board starts to the right of the current board
+    DrawText("CURRENT BOARD", 20, 20, 24, RAYWHITE);
+    int solution_x = clue_area + env->size * cell_size + board_spacing + 20;
+    DrawText("SOLUTION", solution_x, 20, 24, RAYWHITE);
+
+    // Left board: the agent's current grid, read from observations
+    int offset_x = 20;
+    int offset_y = 60;
+
+    // Column clues: one text row (20 px apart) per clue slot, centred over each column; 0 entries are skipped
+    for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) {
+        for (int c = 0; c < env->size; c++) {
+            int clue = env->cols_clues[c * MAX_CLUES + clue_row];
+            if (clue > 0) {
+                char text[4];
+                snprintf(text, sizeof(text), "%d", clue);
+                int x = offset_x + clue_area + c * cell_size + cell_size / 2;
+                int y = offset_y + clue_row * 20 + 10;
+                int text_width = MeasureText(text, font_size);
+                DrawText(text, x - text_width / 2, y, font_size, RAYWHITE);
+            }
+        }
+    }
+
+    // Row clues: written left to right beside each row, vertically centred on it; 0 entries are skipped
+    for (int r = 0; r < env->size; r++) {
+        int clue_x = offset_x + 10;
+        for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) {
+            int clue = env->rows_clues[r * MAX_CLUES + clue_idx];
+            if (clue > 0) {
+                char text[4];
+                snprintf(text, sizeof(text), "%d", clue);
+                int y = offset_y + clue_area + r * cell_size + cell_size / 2 - font_size / 2;
+                DrawText(text, clue_x, y, font_size, RAYWHITE);
+                clue_x += MeasureText(text, font_size) + 5;
+            }
+        }
+    }
+
+    // Current grid: cells are indexed with a MAX_SIZE row stride, only the size x size corner is drawn
+    for (int r = 0; r < env->size; r++) {
+        for (int c = 0; c < env->size; c++) {
+            int x = offset_x + clue_area + c * cell_size;
+            int y = offset_y + clue_area + r * cell_size;
+            int pos = r * MAX_SIZE + c;
+
+            if (env->observations[pos] == CELL_BLACK) {
+                DrawRectangle(x, y, cell_size, cell_size, (Color){50, 50, 50, 255});  // Dark gray for BLACK
+            } else if (env->observations[pos] == CELL_WHITE) {
+                DrawRectangle(x, y, cell_size, cell_size, (Color){240, 240, 240, 255});  // Light gray for WHITE
+            } else {
+                DrawRectangle(x, y, cell_size, cell_size, (Color){120, 120, 120, 255});  // Medium gray for EMPTY
+            }
+            DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY);
+        }
+    }
+
+    // Right board: same layout as the left one, shifted to solution_x
+    offset_x = solution_x;
+
+    // Column clues again (identical data, drawn above the solution grid)
+    for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) {
+        for (int c = 0; c < env->size; c++) {
+            int clue = env->cols_clues[c * MAX_CLUES + clue_row];
+            if (clue > 0) {
+                char text[4];
+                snprintf(text, sizeof(text), "%d", clue);
+                int x = offset_x + clue_area + c * cell_size + cell_size / 2;
+                int y = offset_y + clue_row * 20 + 10;
+                int text_width = MeasureText(text, font_size);
+                DrawText(text, x - text_width / 2, y, font_size, RAYWHITE);
+            }
+        }
+    }
+
+    // Row clues again (identical data, drawn beside the solution grid)
+    for (int r = 0; r < env->size; r++) {
+        int clue_x = offset_x + 10;
+        for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) {
+            int clue = env->rows_clues[r * MAX_CLUES + clue_idx];
+            if (clue > 0) {
+                char text[4];
+                snprintf(text, sizeof(text), "%d", clue);
+                int y = offset_y + clue_area + r * cell_size + cell_size / 2 - font_size / 2;
+                DrawText(text, clue_x, y, font_size, RAYWHITE);
+                clue_x += MeasureText(text, font_size) + 5;
+            }
+        }
+    }
+
+    // Solution grid: BLACK cells in green, every other cell in light gray
+    for (int r = 0; r < env->size; r++) {
+        for (int c = 0; c < env->size; c++) {
+            int x = offset_x + clue_area + c * cell_size;
+            int y = offset_y + clue_area + r * cell_size;
+            int pos = r * MAX_SIZE + c;
+
+            if (env->solution[pos] == CELL_BLACK) {
+                DrawRectangle(x, y, cell_size, cell_size, GREEN);
+            } else {
+                DrawRectangle(x, y, cell_size, cell_size, (Color){200, 200, 200, 255});
+            }
+            DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY);
+        }
+    }
+
+    // Status line under the boards: step counter, filled/target counters and board size
+    int board_height = clue_area + env->size * cell_size;
+    int status_y = board_height + 80;
+    char status[128];
+    snprintf(status, sizeof(status), "Steps: %d/%d | Filled: %d/%d | Size: %dx%d",
+             env->steps_taken, env->max_steps, env->filled_total, env->target_total, env->size, env->size);
+    DrawText(status, 20, status_y, 20, RAYWHITE);
+
+    // Reward of the most recent step and the reward accumulated over the episode
+    char reward_info[128];
+    snprintf(reward_info, sizeof(reward_info), "Last Reward: %.3f | Episode Return: %.3f",
+             env->rewards[0], env->episode_reward);
+    DrawText(reward_info, 20, status_y + 25, 20, RAYWHITE);
+
+    // Static help text. NOTE(review): this function itself only reacts to ESC; confirm clicks / R are handled by the caller
+    DrawText("Click cells to toggle | Press R to reset | ESC to quit", 20, status_y + 60, 16, LIGHTGRAY);
+
+    EndDrawing();
+}
+
+void c_close(Nonogram* env) {
+    // Nothing to tear down unless a raylib window is currently open.
+    if (!IsWindowReady()) return;
+    CloseWindow();
+}
diff --git a/pufferlib/ocean/nonogram/nonogram.py b/pufferlib/ocean/nonogram/nonogram.py
new file mode 100644
index 000000000..c58aa1180
--- /dev/null
+++ b/pufferlib/ocean/nonogram/nonogram.py
@@ -0,0 +1,85 @@
+'''Nonogram logic puzzle environment'''
+
+import gymnasium
+import numpy as np
+
+import pufferlib
+from pufferlib.ocean.nonogram import binding
+
+MAX_SIZE = 8  # largest board side; mirrors `#define MAX_SIZE 8` in nonogram.h
+MIN_SIZE = 4  # same value as the min_size default; not referenced elsewhere in this module
+MAX_CLUES = MAX_SIZE // 2  # clue slots per row/column; mirrors MAX_CLUES in nonogram.h
+OBS_SIZE = MAX_SIZE * MAX_SIZE + 2 * MAX_SIZE * MAX_CLUES + 1  # grid + row clues + column clues + 1 board-size byte
+
+class Nonogram(pufferlib.PufferEnv):
+    def __init__(self, num_envs=1, render_mode=None, log_interval=128,
+                 min_size=4, max_size=8, easy_learn=0, buf=None, seed=0):
+        # Observation space: grid cells (0-3: EMPTY/WHITE/BLACK/PADDING), clues (0-max_size), size encoding (0-1)
+        # Using max_size as high covers all values
+        self.single_observation_space = gymnasium.spaces.Box(low=0, high=max_size,
+            shape=(OBS_SIZE,), dtype=np.uint8)
+        # Action space: 0-63 = mark WHITE, 64-127 = mark BLACK
+        self.single_action_space = gymnasium.spaces.Discrete(MAX_SIZE * MAX_SIZE * 2)
+        self.render_mode = render_mode
+        self.num_agents = num_envs
+        self.log_interval = log_interval
+
+        super().__init__(buf)
+        self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards,
+            self.terminals, self.truncations, num_envs, seed,
+            min_size=min_size, max_size=max_size, easy_learn=easy_learn)
+
+        self.solutions = np.zeros((num_envs, max_size * max_size), dtype=np.uint8)
+
+    def reset(self, seed=0):
+        binding.vec_reset(self.c_envs, seed)
+        self.tick = 0
+        return self.observations, []
+
+    def step(self, actions):
+        self.tick += 1
+
+        self.actions[:] = actions
+        binding.vec_step(self.c_envs)
+
+        info = []
+        if self.tick % self.log_interval == 0:
+            info.append(binding.vec_log(self.c_envs))
+
+        return (self.observations, self.rewards,
+            self.terminals, self.truncations, info)
+
+    def render(self):
+        binding.vec_render(self.c_envs, 0)
+
+    def close(self):
+        binding.vec_close(self.c_envs)
+
+    def get_solutions(self):
+        """Get the solution grids for all environments"""
+        binding.vec_get_solutions(self.c_envs, self.solutions)
+        return self.solutions
+
+    def get_size(self):
+        """Get current board size"""
+        return binding.vec_get_size(self.c_envs)
+
+if __name__ == '__main__':
+    N = 4096
+
+    env = Nonogram(num_envs=N, min_size=2, max_size=8)
+    env.reset()
+    steps = 0
+
+    CACHE = 1024
+    actions = np.random.randint(0, 64, (CACHE, N))
+
+    i = 0
+    import time
+    start = time.time()
+    while time.time() - start < 10:
+        env.step(actions[i % CACHE])
+        steps += N
+        i += 1
+
+    print('Nonogram SPS:', int(steps / (time.time() - start)))
diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py
index c414acde2..8da1a5935 100644
--- a/pufferlib/ocean/torch.py
+++ b/pufferlib/ocean/torch.py
@@ -942,6 +942,97 @@ def decode_actions(self, hidden):
         value = self.value_fn(hidden)  # (B, 1)
         return action, value
 
+class NonogramLSTM(pufferlib.models.LSTMWrapper):  # LSTMWrapper subclass that only supplies default sizes
+    def __init__(self, env, policy, input_size=256, hidden_size=256):  # NOTE(review): Nonogram's default hidden_size is 128; confirm the config overrides these
+        super().__init__(env, policy, input_size, hidden_size)
+
+
+class Nonogram(nn.Module):  # policy network for the flat 129-value Nonogram observation
+    def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwargs):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.is_continuous = False
+
+        # Tetris-style architecture: a CNN encodes the grid, small linear encoders handle clues and size
+
+        # Grid CNN: 4 one-hot cell channels in; two stride-2 convs take 8x8 -> 4x4 -> 2x2 before the linear layer
+        self.conv_grid = nn.Sequential(
+            pufferlib.pytorch.layer_init(nn.Conv2d(4, cnn_channels, kernel_size=3, stride=1, padding=1)),
+            nn.ReLU(),
+            pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, kernel_size=3, stride=2, padding=1)),
+            nn.ReLU(),
+            pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, kernel_size=3, stride=2, padding=1)),
+            nn.ReLU(),
+            nn.Flatten(),
+            pufferlib.pytorch.layer_init(nn.Linear(cnn_channels * 2 * 2, input_size)),
+        )
+
+        # Separate encoders (NO weight sharing); each clue input is 8 lines x 4 slots x 9 one-hot classes
+        self.fc_row_clues = nn.Sequential(
+            pufferlib.pytorch.layer_init(nn.Linear(8 * 4 * 9, input_size // 2)),
+            nn.ReLU(),
+        )
+
+        self.fc_col_clues = nn.Sequential(
+            pufferlib.pytorch.layer_init(nn.Linear(8 * 4 * 9, input_size // 2)),
+            nn.ReLU(),
+        )
+
+        self.fc_size = nn.Sequential(
+            pufferlib.pytorch.layer_init(nn.Linear(9, input_size // 4)),
+            nn.ReLU(),
+        )
+
+        # Projection layer (like Tetris): maps the concatenated grid and scalar features to hidden_size
+        # input_size (grid) + input_size//2 (rows) + input_size//2 (cols) + input_size//4 (size) = 2.25 * input_size
+        self.proj = nn.Sequential(
+            pufferlib.pytorch.layer_init(nn.Linear(input_size + input_size // 2 + input_size // 2 + input_size // 4, hidden_size)),
+            nn.ReLU(),
+        )
+
+        # Output heads: one logit per discrete action, plus a scalar value estimate
+        self.actor = pufferlib.pytorch.layer_init(
+            nn.Linear(hidden_size, env.single_action_space.n), std=0.01)
+        self.value_fn = pufferlib.pytorch.layer_init(
+            nn.Linear(hidden_size, 1), std=1)
+
+    def forward(self, observations, state=None):  # state is accepted but not used here
+        hidden = self.encode_observations(observations)
+        actions, value = self.decode_actions(hidden)
+        return actions, value
+
+    def forward_train(self, x, state=None):  # same computation as forward
+        return self.forward(x, state)
+
+    def encode_observations(self, observations, state=None):
+        B = observations.shape[0]
+
+        # Split the flat observation: [0:64] grid, [64:96] row clues, [96:128] column clues, [128] board size
+        grid = F.one_hot(observations[:, :64].view(B, 8, 8).long(), 4).permute(0, 3, 1, 2).float()  # (B, 4, 8, 8)
+        row_clues = F.one_hot(observations[:, 64:96].view(B, 8, 4).long(), 9).float()  # (B, 8, 4, 9)
+        col_clues = F.one_hot(observations[:, 96:128].view(B, 8, 4).long(), 9).float()  # (B, 8, 4, 9)
+        board_size = F.one_hot(observations[:, 128].long(), 9).float()  # (B, 9)
+
+        # Process grid through CNN (Tetris-style)
+        grid_feat = self.conv_grid(grid)  # (B, input_size)
+
+        # Flatten the one-hot clues and encode rows / columns with their own layers (NO weight sharing)
+        row_feat = self.fc_row_clues(row_clues.reshape(B, -1))  # (B, input_size//2)
+        col_feat = self.fc_col_clues(col_clues.reshape(B, -1))  # (B, input_size//2)
+        size_feat = self.fc_size(board_size)  # (B, input_size//4)
+
+        # Concatenate along the feature axis and project (Tetris-style)
+        combined = torch.cat([grid_feat, row_feat, col_feat, size_feat], dim=-1)
+        features = self.proj(combined)  # (B, hidden_size)
+
+        return features
+
+    def decode_actions(self, flat_hidden):  # returns (action logits, value)
+        action = self.actor(flat_hidden)
+        value = self.value_fn(flat_hidden)
+        return action, value
+
+
 class Drone(nn.Module):
     ''' Drone policy. Flattens obs and applies a linear layer.
     '''

From b5da2cc023cdc4c401004ae3b5059f256dcda9aa Mon Sep 17 00:00:00 2001
From: Eitan Porat <eitan.porat@weizmann.ac.il>
Date: Tue, 21 Oct 2025 20:50:23 +0000
Subject: [PATCH 2/5] Register env

---
 pufferlib/ocean/environment.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py
index ed1408ba6..b22c5f0ee 100644
--- a/pufferlib/ocean/environment.py
+++ b/pufferlib/ocean/environment.py
@@ -126,6 +126,7 @@ def make_multiagent(buf=None, **kwargs):
     'freeway': 'Freeway',
     'enduro': 'Enduro',
     'tetris': 'Tetris',
+    'nonogram': 'Nonogram',
     'cartpole': 'Cartpole',
     'moba': 'Moba',
     'matsci': 'Matsci',

From ba94269c36cacc56829611a480af9c629f06ca6a Mon Sep 17 00:00:00 2001
From: Eitan Porat <eitan.porat@weizmann.ac.il>
Date: Tue, 21 Oct 2025 21:13:39 +0000
Subject: [PATCH 3/5] clean up code

---
 pufferlib/config/ocean/nonogram.ini | 102 +--------------
 pufferlib/ocean/nonogram/nonogram.h | 184 +---------------------------
 pufferlib/ocean/torch.py            |  30 ++---
 3 files changed, 11 insertions(+), 305 deletions(-)

diff --git a/pufferlib/config/ocean/nonogram.ini b/pufferlib/config/ocean/nonogram.ini
index 0c5d6ff44..7f203644a 100644
--- a/pufferlib/config/ocean/nonogram.ini
+++ b/pufferlib/config/ocean/nonogram.ini
@@ -2,20 +2,18 @@
 package = ocean
 env_name = puffer_nonogram
 policy_name = Nonogram
-; policy_name = Policy
 rnn_name = Recurrent
 
 [env]
 num_envs = 4096
 min_size = 4
 max_size = 8
-easy_learn = 1
+easy_learn = 0
 
 [sweep]
 metric = score
 
 [train]
-; Hyperparameters from wandb config
 name = pufferai
 seed = 42
 gamma = 0.99965
@@ -52,101 +50,3 @@ compile_fullgraph = True
 max_minibatch_size = 32768
 checkpoint_interval = 200
 torch_deterministic = True
-
-; PREVIOUS RUN: Run ID 3k4cpz3ts3keprggvyldpo0bzpo2djsc
-; total_timesteps = 1e10
-; minibatch_size = 65536
-; use_rnn = True
-; update_epochs = 1
-; bptt_horizon = 64
-; gae_lambda = 0.9860112307817481
-; gamma = 0.9955237802885055
-; clip_coef = 0.3339182687952462
-; vf_coef = 1.3604733057894562
-; vf_clip_coef = 0.1
-; ent_coef = 0.01267345559258322
-; max_grad_norm = 0.7481994494317118
-; learning_rate = 0.0071601604548789605
-; adam_eps = 2.1466958248623007e-10
-; adam_beta1 = 0.9600776540257598
-; adam_beta2 = 0.9987918405974582
-; anneal_lr = True
-; optimizer = muon
-; prio_alpha = 0.9248668653880601
-; prio_beta0 = 0.9583638692801064
-; vtrace_c_clip = 2.931704492528996
-; vtrace_rho_clip = 1.2830763533710652
-
-; PREVIOUS RUN: puffer_sweep_simple_policy
-; total_timesteps = 1e10
-; minibatch_size = 16384
-; use_rnn = True
-; update_epochs = 1
-; bptt_horizon = 64
-; gae_lambda = 0.9698
-; gamma = 0.9979
-; clip_coef = 0.1896
-; vf_coef = 1.4565
-; vf_clip_coef = 0.2296
-; ent_coef = 0.01257
-; max_grad_norm = 0.4804
-; learning_rate = 0.06449
-; adam_eps = 4.577e-10
-; adam_beta1 = 0.8184
-; adam_beta2 = 0.9996
-; anneal_lr = True
-; optimizer = muon
-; prio_alpha = 0.8445
-; prio_beta0 = 0.9498
-; vtrace_c_clip = 3.5953
-; vtrace_rho_clip = 2.2273
-
-; PREVIOUS RUN: puffer_sweep_black_white_actions
-; total_timesteps = 1e10
-; minibatch_size = 32768
-; use_rnn = True
-; update_epochs = 1
-; bptt_horizon = 64
-; gae_lambda = 0.8645
-; gamma = 0.9991
-; clip_coef = 0.3043
-; vf_coef = 2.1905
-; vf_clip_coef = 3.1475
-; ent_coef = 0.002274
-; max_grad_norm = 1.0202
-; learning_rate = 0.007169
-; adam_eps = 6.036e-11
-; adam_beta1 = 0.9366
-; adam_beta2 = 0.9985
-; anneal_lr = True
-; optimizer = muon
-; prio_alpha = 0.8741
-; prio_beta0 = 0.7869
-; vtrace_c_clip = 1.6859
-; vtrace_rho_clip = 1.5254
-; total_timesteps = 10_000_000_000
-; minibatch_size = 32768
-; use_rnn = True
-; update_epochs = 1
-; bptt_horizon = 64
-; gae_lambda = 0.6
-; gamma = 0.9999
-; clip_coef = 0.01
-; vf_coef = 4.453
-; vf_clip_coef = 0.1
-; ent_coef = 0.001160
-; max_grad_norm = 1.071
-; learning_rate = 0.003555
-; adam_eps = 1.675e-14
-; adam_beta1 = 0.9817
-; adam_beta2 = 0.9052
-; anneal_lr = True
-; optimizer = muon
-; compile = False
-; precision = float32
-; torch_deterministic = True
-; checkpoint_interval = 200
-; prio_alpha = 0.99
-; prio_beta0 = 0.855
-; vtrace_c_clip = 0.7794
-; vtrace_rho_clip = 0.8655
diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h
index b4060a1de..f41f6244e 100644
--- a/pufferlib/ocean/nonogram/nonogram.h
+++ b/pufferlib/ocean/nonogram/nonogram.h
@@ -7,15 +7,6 @@
 #include <string.h>
 #include "raylib.h"
 
-// Debug mode: set to 1 to enable debug output, 0 to disable
-#define DEBUG 0
-
-#if DEBUG
-#define debug_printf(...) printf(__VA_ARGS__)
-#else
-#define debug_printf(...) ((void)0)
-#endif
-
 #define MAX_SIZE 8
 #define MAX_CLUES (MAX_SIZE / 2)
 
@@ -33,7 +24,6 @@ const float REWARD_EASY_LEARN_CORRECT = 0.01;
 const float REWARD_EASY_LEARN_INCORRECT = -0.01;
 const float REWARD_NO_MATCH = -0.05;
 
-// Required struct for logging
 typedef struct {
     float score;
     float episode_return;
@@ -42,7 +32,6 @@ typedef struct {
     float n;
 } Log;
 
-// Nonogram environment struct
 typedef struct {
     Log log;
     unsigned char* observations;
@@ -50,7 +39,6 @@ typedef struct {
     float* rewards;
     unsigned char* terminals;
 
-    // Environment state
     int size;
     int min_size;
     int max_size;
@@ -60,10 +48,8 @@ typedef struct {
     int target_total;
     int easy_learn;
 
-    // Solution (for generating clues)
     unsigned char solution[MAX_SIZE * MAX_SIZE];
 
-    // Clues
     unsigned char rows_clues[MAX_SIZE * MAX_CLUES];
     unsigned char cols_clues[MAX_SIZE * MAX_CLUES];
     unsigned char rows_num_runs[MAX_SIZE];
@@ -73,19 +59,15 @@ typedef struct {
     unsigned char rows_max_clue[MAX_SIZE];
     unsigned char cols_max_clue[MAX_SIZE];
 
-    // Current totals
     unsigned char rows_totals[MAX_SIZE];
     unsigned char cols_totals[MAX_SIZE];
 
-    // Track completed lines
     unsigned char rows_completed[MAX_SIZE];
     unsigned char cols_completed[MAX_SIZE];
 
-    // Episode reward accumulator
     float episode_reward;
 } Nonogram;
 
-// Helper function implementations
 void add_log(Nonogram* env) {
     env->log.score += env->rewards[0];
     env->log.episode_length += env->steps_taken;
@@ -98,37 +80,21 @@ int get_row_run_length(Nonogram* env, int row, int col) {
     int row_start = row * MAX_SIZE;
     int run_length = 1;
 
-    debug_printf("  get_row_run_length: row=%d, col=%d, row_start=%d\n", row, col, row_start);
-    debug_printf("  Row cells before marking: ");
-    for (int c = 0; c < env->size; c++) {
-        debug_printf("%d ", env->observations[row_start + c]);
-    }
-    debug_printf("\n");
-
-    // Count left
-    int left_count = 0;
     for (int c = col - 1; c >= 0; c--) {
         if (env->observations[row_start + c] == CELL_BLACK) {
             run_length++;
-            left_count++;
         } else {
             break;
         }
     }
-    debug_printf("  Left count: %d\n", left_count);
 
-    // Count right
-    int right_count = 0;
     for (int c = col + 1; c < env->size; c++) {
         if (env->observations[row_start + c] == CELL_BLACK) {
             run_length++;
-            right_count++;
         } else {
             break;
         }
     }
-    debug_printf("  Right count: %d\n", right_count);
-    debug_printf("  Total run_length (1 + left + right): %d\n", run_length);
 
     return run_length;
 }
@@ -136,67 +102,35 @@ int get_row_run_length(Nonogram* env, int row, int col) {
 int get_col_run_length(Nonogram* env, int row, int col) {
     int run_length = 1;
 
-    debug_printf("  get_col_run_length: row=%d, col=%d\n", row, col);
-    debug_printf("  Col cells before marking: ");
-    for (int r = 0; r < env->size; r++) {
-        debug_printf("%d ", env->observations[r * MAX_SIZE + col]);
-    }
-    debug_printf("\n");
-
-    // Count up
-    int up_count = 0;
     for (int r = row - 1; r >= 0; r--) {
         if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) {
             run_length++;
-            up_count++;
         } else {
             break;
         }
     }
-    debug_printf("  Up count: %d\n", up_count);
 
-    // Count down
-    int down_count = 0;
     for (int r = row + 1; r < env->size; r++) {
         if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) {
             run_length++;
-            down_count++;
         } else {
             break;
         }
     }
-    debug_printf("  Down count: %d\n", down_count);
-    debug_printf("  Total run_length (1 + up + down): %d\n", run_length);
 
     return run_length;
 }
 
 int check_line_matches(unsigned char* line_data, unsigned char* clues, int num_runs, int size) {
-    debug_printf("  check_line_matches: num_runs=%d, size=%d\n", num_runs, size);
-    debug_printf("  Line data: ");
-    for (int i = 0; i < size; i++) {
-        debug_printf("%d ", line_data[i]);
-    }
-    debug_printf("\n");
-    debug_printf("  Expected clues: ");
-    for (int i = 0; i < num_runs; i++) {
-        debug_printf("%d ", clues[i]);
-    }
-    debug_printf("\n");
-
     int run_idx = 0;
     int count = 0;
 
     for (int i = 0; i < size; i++) {
         if (line_data[i] == CELL_BLACK) {
             count++;
-            debug_printf("  Position %d: BLACK, count=%d\n", i, count);
         } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) {
             if (count > 0) {
-                debug_printf("  End of run at position %d: count=%d, expected=%d (run_idx=%d)\n",
-                       i, count, clues[run_idx], run_idx);
                 if (clues[run_idx] != count) {
-                    debug_printf("  MISMATCH! Expected %d but got %d\n", clues[run_idx], count);
                     return 0;
                 }
                 run_idx++;
@@ -205,28 +139,20 @@ int check_line_matches(unsigned char* line_data, unsigned char* clues, int num_r
         }
     }
 
-    // Check final run
     if (count > 0) {
-        debug_printf("  Final run: count=%d, expected=%d (run_idx=%d)\n", count, clues[run_idx], run_idx);
         if (clues[run_idx] != count) {
-            debug_printf("  FINAL MISMATCH! Expected %d but got %d\n", clues[run_idx], count);
             return 0;
         }
         run_idx++;
     }
 
-    debug_printf("  Total runs found: %d, expected: %d\n", run_idx, num_runs);
-    int matches = (run_idx == num_runs);
-    debug_printf("  Pattern matches: %d\n", matches);
-    return matches;
+    return (run_idx == num_runs);
 }
 
-// Helper to generate random float in [0, 1]
 float rand_uniform() {
     return (float)rand() / (float)RAND_MAX;
 }
 
-// Required functions
 void c_reset(Nonogram* env) {
     env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1));
     env->max_steps = env->easy_learn ? env->size * env->size : 4 * env->size * env->size;
@@ -234,18 +160,14 @@ void c_reset(Nonogram* env) {
     int full_grid_size = MAX_SIZE * MAX_SIZE;
     int max_clues = MAX_SIZE / 2;
 
-    // Initialize all grid as PADDING, then clear valid cells to EMPTY (using MAX_SIZE stride)
     memset(env->observations, CELL_PADDING, full_grid_size);
     for (int r = 0; r < env->size; r++) {
         for (int c = 0; c < env->size; c++) {
             env->observations[r * MAX_SIZE + c] = CELL_EMPTY;
         }
     }
-    // Clear clue areas
     memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues);
 
-    // Generate random solution using MAX_SIZE stride with uniform fill probability
-    // Sample fill probability p uniformly from [0, 1] for difficulty variation
     float fill_prob = rand_uniform();
     memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE);
     int has_filled = 0;
@@ -258,18 +180,15 @@ void c_reset(Nonogram* env) {
         }
     }
 
-    // Ensure at least one square is set
     if (!has_filled) {
         int rand_row = rand() % env->size;
         int rand_col = rand() % env->size;
         env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK;
     }
 
-    // Reset clues arrays
     memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES);
     memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES);
 
-    // Calculate row clues
     for (int i = 0; i < env->size; i++) {
         int clue_idx = 0;
         int count = 0;
@@ -289,7 +208,6 @@ void c_reset(Nonogram* env) {
         env->rows_num_runs[i] = clue_idx;
     }
 
-    // Calculate column clues
     for (int j = 0; j < env->size; j++) {
         int clue_idx = 0;
         int count = 0;
@@ -309,14 +227,11 @@ void c_reset(Nonogram* env) {
         env->cols_num_runs[j] = clue_idx;
     }
 
-    // Store clues in observation
     memcpy(env->observations + full_grid_size, env->rows_clues, MAX_SIZE * max_clues);
     memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues, env->cols_clues, MAX_SIZE * max_clues);
 
-    // Store board size as scalar at end of observation
     env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size;
 
-    // Calculate max clues and target sums
     memset(env->rows_totals, 0, MAX_SIZE);
     memset(env->cols_totals, 0, MAX_SIZE);
     memset(env->rows_completed, 0, MAX_SIZE);
@@ -324,7 +239,6 @@ void c_reset(Nonogram* env) {
     env->filled_total = 0;
 
     for (int i = 0; i < env->size; i++) {
-        // Find max clue for row
         int max_clue = 0;
         int sum = 0;
         for (int j = 0; j < max_clues; j++) {
@@ -337,7 +251,6 @@ void c_reset(Nonogram* env) {
         env->rows_max_clue[i] = max_clue;
         env->rows_target_sum[i] = sum;
 
-        // Find max clue for col
         max_clue = 0;
         sum = 0;
         for (int j = 0; j < max_clues; j++) {
@@ -351,52 +264,11 @@ void c_reset(Nonogram* env) {
         env->cols_target_sum[i] = sum;
     }
 
-    // Calculate target total
     env->target_total = 0;
     for (int i = 0; i < env->size; i++) {
         env->target_total += env->rows_target_sum[i];
     }
 
-    // Debug: print solution and clues
-    debug_printf("\n=== RESET: New puzzle generated (size=%d) ===\n", env->size);
-    debug_printf("Solution grid:\n");
-    for (int r = 0; r < env->size; r++) {
-        debug_printf("  Row %d: ", r);
-        for (int c = 0; c < env->size; c++) {
-            debug_printf("%d ", env->solution[r * MAX_SIZE + c]);
-        }
-        debug_printf("\n");
-    }
-
-    debug_printf("\nRow clues:\n");
-    for (int r = 0; r < env->size; r++) {
-        debug_printf("  Row %d (num_runs=%d, target_sum=%d, max_clue=%d): ",
-               r, env->rows_num_runs[r], env->rows_target_sum[r], env->rows_max_clue[r]);
-        for (int i = 0; i < MAX_CLUES; i++) {
-            int clue = env->rows_clues[r * MAX_CLUES + i];
-            if (clue > 0) {
-                debug_printf("%d ", clue);
-            }
-        }
-        debug_printf("\n");
-    }
-
-    debug_printf("\nColumn clues:\n");
-    for (int c = 0; c < env->size; c++) {
-        debug_printf("  Col %d (num_runs=%d, target_sum=%d, max_clue=%d): ",
-               c, env->cols_num_runs[c], env->cols_target_sum[c], env->cols_max_clue[c]);
-        for (int i = 0; i < MAX_CLUES; i++) {
-            int clue = env->cols_clues[c * MAX_CLUES + i];
-            if (clue > 0) {
-                debug_printf("%d ", clue);
-            }
-        }
-        debug_printf("\n");
-    }
-
-    debug_printf("\nTarget total BLACK cells: %d\n", env->target_total);
-    debug_printf("===================================\n\n");
-
     env->steps_taken = 0;
     env->episode_reward = 0;
 }
@@ -409,11 +281,7 @@ void c_step(Nonogram* env) {
 
     env->steps_taken++;
 
-    debug_printf("DEBUG c_step: action=%d, steps=%d\n", action, env->steps_taken);
-
-    // Check timeout FIRST before any game logic
     if (env->steps_taken > env->max_steps) {
-        debug_printf("DEBUG: TIMEOUT\n");
         env->terminals[0] = 1;
         env->rewards[0] = REWARD_TIMEOUT;
         env->episode_reward += REWARD_TIMEOUT;
@@ -422,21 +290,13 @@ void c_step(Nonogram* env) {
         return;
     }
 
-    // Decode action: 0-63 = mark WHITE, 64-127 = mark BLACK
     int mark_black = action >= (MAX_SIZE * MAX_SIZE);
     int pos = action % (MAX_SIZE * MAX_SIZE);
 
-    debug_printf("DEBUG: mark_black=%d, pos=%d\n", mark_black, pos);
-
-    // Convert position to row/col using MAX_SIZE stride
     int row = pos / MAX_SIZE;
     int col = pos % MAX_SIZE;
 
-    debug_printf("DEBUG: row=%d, col=%d, size=%d\n", row, col, env->size);
-
-    // Check if action is out of bounds (hitting padding area)
     if (row >= env->size || col >= env->size) {
-        debug_printf("DEBUG: OUT OF BOUNDS (row=%d, col=%d >= size=%d)\n", row, col, env->size);
         env->terminals[0] = 1;
         env->rewards[0] = REWARD_OUT_OF_BOUNDS;
         env->episode_reward += REWARD_OUT_OF_BOUNDS;
@@ -447,12 +307,7 @@ void c_step(Nonogram* env) {
 
     unsigned char current = env->observations[pos];
 
-    debug_printf("DEBUG: current cell value=%d (EMPTY=%d, WHITE=%d, BLACK=%d, PADDING=%d)\n",
-           current, CELL_EMPTY, CELL_WHITE, CELL_BLACK, CELL_PADDING);
-
-    // Can't mark a cell that's already been marked
     if (current != CELL_EMPTY) {
-        debug_printf("DEBUG: INVALID - cell already marked (current=%d)\n", current);
         env->terminals[0] = 1;
         env->rewards[0] = REWARD_INVALID_MOVE;
         env->episode_reward += REWARD_INVALID_MOVE;
@@ -461,19 +316,9 @@ void c_step(Nonogram* env) {
         return;
     }
 
-    // Mark cell as BLACK or WHITE
     if (mark_black) {
-        debug_printf("DEBUG: Marking BLACK\n");
-        // Marking BLACK - check if valid
-        // First check: totals equal target - invalid move (terminate episode)
-        debug_printf("DEBUG: rows_totals[%d]=%d, rows_target_sum[%d]=%d\n",
-               row, env->rows_totals[row], row, env->rows_target_sum[row]);
-        debug_printf("DEBUG: cols_totals[%d]=%d, cols_target_sum[%d]=%d\n",
-               col, env->cols_totals[col], col, env->cols_target_sum[col]);
-
         if (env->rows_totals[row] == env->rows_target_sum[row] ||
             env->cols_totals[col] == env->cols_target_sum[col]) {
-            debug_printf("DEBUG: INVALID - row or col already full\n");
             env->terminals[0] = 1;
             env->rewards[0] = REWARD_INVALID_MOVE;
             env->episode_reward += REWARD_INVALID_MOVE;
@@ -482,13 +327,9 @@ void c_step(Nonogram* env) {
             return;
         }
 
-        // Check if marking this cell BLACK would create a run longer than max allowed
         int row_run = get_row_run_length(env, row, col);
-        debug_printf("DEBUG: row_run_length=%d, rows_max_clue[%d]=%d\n",
-               row_run, row, env->rows_max_clue[row]);
 
         if (row_run > env->rows_max_clue[row]) {
-            debug_printf("DEBUG: INVALID - row run too long\n");
             env->terminals[0] = 1;
             env->rewards[0] = REWARD_INVALID_MOVE;
             env->episode_reward += REWARD_INVALID_MOVE;
@@ -498,11 +339,8 @@ void c_step(Nonogram* env) {
         }
 
         int col_run = get_col_run_length(env, row, col);
-        debug_printf("DEBUG: col_run_length=%d, cols_max_clue[%d]=%d\n",
-               col_run, col, env->cols_max_clue[col]);
 
         if (col_run > env->cols_max_clue[col]) {
-            debug_printf("DEBUG: INVALID - col run too long\n");
             env->terminals[0] = 1;
             env->rewards[0] = REWARD_INVALID_MOVE;
             env->episode_reward += REWARD_INVALID_MOVE;
@@ -511,24 +349,16 @@ void c_step(Nonogram* env) {
             return;
         }
 
-        // Second check: if completing row/col, check runs match
         int row_completed = 0;
         int col_completed = 0;
 
-        debug_printf("DEBUG: Checking line completion...\n");
-
         if (env->rows_totals[row] == env->rows_target_sum[row] - 1) {
-            debug_printf("DEBUG: Would complete row %d, checking pattern...\n", row);
-            // Temporarily mark BLACK to check
             env->observations[pos] = CELL_BLACK;
             int row_start = row * MAX_SIZE;
             int matches = check_line_matches(env->observations + row_start,
                                    env->rows_clues + row * MAX_CLUES,
                                    env->rows_num_runs[row], env->size);
-            debug_printf("DEBUG: Row pattern matches: %d\n", matches);
             if (!matches) {
-                // Runs don't match - invalid move (terminate episode)
-                debug_printf("DEBUG: INVALID - row pattern doesn't match\n");
                 env->observations[pos] = CELL_EMPTY;
                 env->terminals[0] = 1;
                 env->rewards[0] = REWARD_NO_MATCH;
@@ -542,8 +372,6 @@ void c_step(Nonogram* env) {
         }
 
         if (env->cols_totals[col] == env->cols_target_sum[col] - 1) {
-            debug_printf("DEBUG: Would complete col %d, checking pattern...\n", col);
-            // Temporarily mark BLACK to check
             env->observations[pos] = CELL_BLACK;
             unsigned char col_data[MAX_SIZE];
             for (int i = 0; i < env->size; i++) {
@@ -552,10 +380,7 @@ void c_step(Nonogram* env) {
             int matches = check_line_matches(col_data,
                                    env->cols_clues + col * MAX_CLUES,
                                    env->cols_num_runs[col], env->size);
-            debug_printf("DEBUG: Col pattern matches: %d\n", matches);
             if (!matches) {
-                // Runs don't match - invalid move (terminate episode)
-                debug_printf("DEBUG: INVALID - col pattern doesn't match\n");
                 env->observations[pos] = CELL_EMPTY;
                 env->terminals[0] = 1;
                 env->rewards[0] = REWARD_NO_MATCH;
@@ -568,13 +393,11 @@ void c_step(Nonogram* env) {
             col_completed = 1;
         }
 
-        // Apply mark BLACK
         env->observations[pos] = CELL_BLACK;
         env->rows_totals[row]++;
         env->cols_totals[col]++;
         env->filled_total++;
 
-        // Give reward for newly completed lines only
         int row_newly_completed = row_completed && !env->rows_completed[row];
         int col_newly_completed = col_completed && !env->cols_completed[col];
 
@@ -585,21 +408,17 @@ void c_step(Nonogram* env) {
         env->rewards[0] += line_reward;
         env->episode_reward += line_reward;
     } else {
-        // Marking WHITE - always valid (just marks empty as not-black)
         env->observations[pos] = CELL_WHITE;
     }
 
-    // Easy learn mode: check if cell matches solution
     if (env->easy_learn) {
         unsigned char solution_cell = env->solution[pos];
         unsigned char actual = env->observations[pos];
 
         if (solution_cell == actual) {
-            // Correct move: give positive reward and continue
             env->rewards[0] += REWARD_EASY_LEARN_CORRECT;
             env->episode_reward += REWARD_EASY_LEARN_CORRECT;
         } else {
-            // Incorrect move: give negative reward, terminate and reset
             env->rewards[0] += REWARD_EASY_LEARN_INCORRECT;
             env->episode_reward += REWARD_EASY_LEARN_INCORRECT;
             env->terminals[0] = 1;
@@ -609,7 +428,6 @@ void c_step(Nonogram* env) {
         }
     }
 
-    // Check if solved (filled_total == target_total means all BLACK cells placed correctly)
     if (env->filled_total == env->target_total) {
         env->terminals[0] = 1;
         env->rewards[0] = REWARD_WIN;
diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py
index 8da1a5935..726e9c6b9 100644
--- a/pufferlib/ocean/torch.py
+++ b/pufferlib/ocean/torch.py
@@ -953,9 +953,6 @@ def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwar
         self.hidden_size = hidden_size
         self.is_continuous = False
 
-        # Tetris-style architecture: multi-layer CNN for grid + separate scalar encoder
-
-        # Grid CNN (like Tetris): multiple conv layers with strides
         self.conv_grid = nn.Sequential(
             pufferlib.pytorch.layer_init(nn.Conv2d(4, cnn_channels, kernel_size=3, stride=1, padding=1)),
             nn.ReLU(),
@@ -967,7 +964,6 @@ def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwar
             pufferlib.pytorch.layer_init(nn.Linear(cnn_channels * 2 * 2, input_size)),
         )
 
-        # Separate encoders for row clues, column clues, and size (NO weight sharing)
         self.fc_row_clues = nn.Sequential(
             pufferlib.pytorch.layer_init(nn.Linear(8 * 4 * 9, input_size // 2)),
             nn.ReLU(),
@@ -983,14 +979,11 @@ def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwar
             nn.ReLU(),
         )
 
-        # Projection layer (like Tetris): combine grid and all scalar features
-        # input_size (grid) + input_size//2 (rows) + input_size//2 (cols) + input_size//4 (size) = 2.25 * input_size
         self.proj = nn.Sequential(
             pufferlib.pytorch.layer_init(nn.Linear(input_size + input_size // 2 + input_size // 2 + input_size // 4, hidden_size)),
             nn.ReLU(),
         )
 
-        # Output heads
         self.actor = pufferlib.pytorch.layer_init(
             nn.Linear(hidden_size, env.single_action_space.n), std=0.01)
         self.value_fn = pufferlib.pytorch.layer_init(
@@ -1007,23 +1000,18 @@ def forward_train(self, x, state=None):
     def encode_observations(self, observations, state=None):
         B = observations.shape[0]
 
-        # Parse observations
-        grid = F.one_hot(observations[:, :64].view(B, 8, 8).long(), 4).permute(0, 3, 1, 2).float()  # (B, 4, 8, 8)
-        row_clues = F.one_hot(observations[:, 64:96].view(B, 8, 4).long(), 9).float()  # (B, 8, 4, 9)
-        col_clues = F.one_hot(observations[:, 96:128].view(B, 8, 4).long(), 9).float()  # (B, 8, 4, 9)
-        board_size = F.one_hot(observations[:, 128].long(), 9).float()  # (B, 9)
-
-        # Process grid through CNN (Tetris-style)
-        grid_feat = self.conv_grid(grid)  # (B, input_size)
+        grid = F.one_hot(observations[:, :64].view(B, 8, 8).long(), 4).permute(0, 3, 1, 2).float()
+        row_clues = F.one_hot(observations[:, 64:96].view(B, 8, 4).long(), 9).float()
+        col_clues = F.one_hot(observations[:, 96:128].view(B, 8, 4).long(), 9).float()
+        board_size = F.one_hot(observations[:, 128].long(), 9).float()
 
-        # Process scalar features separately (NO weight sharing)
-        row_feat = self.fc_row_clues(row_clues.reshape(B, -1))  # (B, input_size//2)
-        col_feat = self.fc_col_clues(col_clues.reshape(B, -1))  # (B, input_size//2)
-        size_feat = self.fc_size(board_size)  # (B, input_size//4)
+        grid_feat = self.conv_grid(grid)
+        row_feat = self.fc_row_clues(row_clues.reshape(B, -1))
+        col_feat = self.fc_col_clues(col_clues.reshape(B, -1))
+        size_feat = self.fc_size(board_size)
 
-        # Combine and project (Tetris-style)
         combined = torch.cat([grid_feat, row_feat, col_feat, size_feat], dim=-1)
-        features = self.proj(combined)  # (B, hidden_size)
+        features = self.proj(combined)
 
         return features
 

From 565ea9f4d0758c5b5b599a20d1fa91eb5d65b3d9 Mon Sep 17 00:00:00 2001
From: Eitan Porat <eitan.porat@weizmann.ac.il>
Date: Tue, 21 Oct 2025 21:22:21 +0000
Subject: [PATCH 4/5] format code

---
 pufferlib/ocean/nonogram/binding.c  | 107 +--
 pufferlib/ocean/nonogram/nonogram.c |  38 +-
 pufferlib/ocean/nonogram/nonogram.h | 989 ++++++++++++++--------------
 3 files changed, 574 insertions(+), 560 deletions(-)

diff --git a/pufferlib/ocean/nonogram/binding.c b/pufferlib/ocean/nonogram/binding.c
index 8f63c9d9b..910f5e90f 100644
--- a/pufferlib/ocean/nonogram/binding.c
+++ b/pufferlib/ocean/nonogram/binding.c
@@ -1,75 +1,76 @@
-#include <Python.h>
 #include "nonogram.h"
+#include <Python.h>
 
 // Forward declare custom methods
-static PyObject* vec_get_solutions(PyObject* self, PyObject* args);
-static PyObject* vec_get_size(PyObject* self, PyObject* args);
+static PyObject *vec_get_solutions(PyObject *self, PyObject *args);
+static PyObject *vec_get_size(PyObject *self, PyObject *args);
 
 #define Env Nonogram
-#define MY_METHODS \
-    {"vec_get_solutions", vec_get_solutions, METH_VARARGS, "Get solutions from all environments"}, \
-    {"vec_get_size", vec_get_size, METH_VARARGS, "Get current board size"}
+#define MY_METHODS                                                             \
+  {"vec_get_solutions", vec_get_solutions, METH_VARARGS,                       \
+   "Get solutions from all environments"},                                     \
+      {"vec_get_size", vec_get_size, METH_VARARGS, "Get current board size"}
 
 #include "../env_binding.h"
 
-static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
-    env->min_size = unpack(kwargs, "min_size");
-    env->max_size = unpack(kwargs, "max_size");
-    env->easy_learn = unpack(kwargs, "easy_learn");
-    env->size = env->max_size;
-    env->max_steps = 4 * env->max_size * env->max_size;
-    return 0;
+static int my_init(Env *env, PyObject *args, PyObject *kwargs) {
+  env->min_size = unpack(kwargs, "min_size");
+  env->max_size = unpack(kwargs, "max_size");
+  env->easy_learn = unpack(kwargs, "easy_learn");
+  env->size = env->max_size;
+  env->max_steps = 4 * env->max_size * env->max_size;
+  return 0;
 }
 
-static int my_log(PyObject* dict, Log* log) {
-    assign_to_dict(dict, "score", log->score);
-    assign_to_dict(dict, "episode_return", log->episode_return);
-    assign_to_dict(dict, "episode_length", log->episode_length);
-    assign_to_dict(dict, "solved", log->solved);
-    return 0;
+static int my_log(PyObject *dict, Log *log) {
+  assign_to_dict(dict, "score", log->score);
+  assign_to_dict(dict, "episode_return", log->episode_return);
+  assign_to_dict(dict, "episode_length", log->episode_length);
+  assign_to_dict(dict, "solved", log->solved);
+  return 0;
 }
 
 // Custom method to get solutions from all environments
-static PyObject* vec_get_solutions(PyObject* self, PyObject* args) {
-    if (PyTuple_Size(args) != 2) {
-        PyErr_SetString(PyExc_TypeError, "vec_get_solutions requires 2 arguments");
-        return NULL;
-    }
+static PyObject *vec_get_solutions(PyObject *self, PyObject *args) {
+  if (PyTuple_Size(args) != 2) {
+    PyErr_SetString(PyExc_TypeError, "vec_get_solutions requires 2 arguments");
+    return NULL;
+  }
 
-    VecEnv* vec = unpack_vecenv(args);
-    if (!vec) {
-        return NULL;
-    }
+  VecEnv *vec = unpack_vecenv(args);
+  if (!vec) {
+    return NULL;
+  }
 
-    PyObject* solutions_obj = PyTuple_GetItem(args, 1);
-    if (!PyObject_TypeCheck(solutions_obj, &PyArray_Type)) {
-        PyErr_SetString(PyExc_TypeError, "solutions must be a NumPy array");
-        return NULL;
-    }
-    PyArrayObject* solutions = (PyArrayObject*)solutions_obj;
-    if (!PyArray_ISCONTIGUOUS(solutions)) {
-        PyErr_SetString(PyExc_ValueError, "solutions must be contiguous");
-        return NULL;
-    }
+  PyObject *solutions_obj = PyTuple_GetItem(args, 1);
+  if (!PyObject_TypeCheck(solutions_obj, &PyArray_Type)) {
+    PyErr_SetString(PyExc_TypeError, "solutions must be a NumPy array");
+    return NULL;
+  }
+  PyArrayObject *solutions = (PyArrayObject *)solutions_obj;
+  if (!PyArray_ISCONTIGUOUS(solutions)) {
+    PyErr_SetString(PyExc_ValueError, "solutions must be contiguous");
+    return NULL;
+  }
 
-    // Copy solutions from each environment (always use max_size for buffer)
-    unsigned char* sol_ptr = PyArray_DATA(solutions);
-    int max_grid_size = MAX_SIZE * MAX_SIZE;
-    for (int i = 0; i < vec->num_envs; i++) {
-        Nonogram* env = vec->envs[i];
-        memcpy(sol_ptr + i * max_grid_size, env->solution, max_grid_size);
-    }
+  // Copy solutions from each environment (always use max_size for buffer)
+  unsigned char *sol_ptr = PyArray_DATA(solutions);
+  int max_grid_size = MAX_SIZE * MAX_SIZE;
+  for (int i = 0; i < vec->num_envs; i++) {
+    Nonogram *env = vec->envs[i];
+    memcpy(sol_ptr + i * max_grid_size, env->solution, max_grid_size);
+  }
 
-    Py_RETURN_NONE;
+  Py_RETURN_NONE;
 }
 
 // Get current board size from first environment
-static PyObject* vec_get_size(PyObject* self, PyObject* args) {
-    VecEnv* vec = unpack_vecenv(args);
-    if (!vec) {
-        return NULL;
-    }
+static PyObject *vec_get_size(PyObject *self, PyObject *args) {
+  VecEnv *vec = unpack_vecenv(args);
+  if (!vec) {
+    return NULL;
+  }
 
-    Nonogram* env = vec->envs[0];
-    return PyLong_FromLong(env->size);
+  Nonogram *env = vec->envs[0];
+  return PyLong_FromLong(env->size);
 }
diff --git a/pufferlib/ocean/nonogram/nonogram.c b/pufferlib/ocean/nonogram/nonogram.c
index 953be3dd1..91e07bbd4 100644
--- a/pufferlib/ocean/nonogram/nonogram.c
+++ b/pufferlib/ocean/nonogram/nonogram.c
@@ -6,27 +6,27 @@
 #include "nonogram.h"
 
 int main() {
-    Nonogram env = {.size = 8};
-    int max_clues = env.size / 2;
-    int obs_size = env.size * env.size + 2 * env.size * max_clues;
+  Nonogram env = {.size = 8, .min_size = 8, .max_size = 8};
+  // Grid + row clues + column clues + one trailing byte for the board size.
+  int obs_size = MAX_SIZE * MAX_SIZE + 2 * MAX_SIZE * MAX_CLUES + 1;
 
-    env.max_steps = 4 * env.size * env.size;
-    env.observations = (unsigned char*)calloc(obs_size, sizeof(unsigned char));
-    env.actions = (int*)calloc(1, sizeof(int));
-    env.rewards = (float*)calloc(1, sizeof(float));
-    env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char));
+  env.max_steps = 4 * env.size * env.size;
+  env.observations = (unsigned char *)calloc(obs_size, sizeof(unsigned char));
+  env.actions = (int *)calloc(1, sizeof(int));
+  env.rewards = (float *)calloc(1, sizeof(float));
+  env.terminals = (unsigned char *)calloc(1, sizeof(unsigned char));
 
-    c_reset(&env);
+  c_reset(&env);
+  c_render(&env);
+  while (!WindowShouldClose()) {
+    env.actions[0] = rand() % (env.size * env.size);
+    c_step(&env);
     c_render(&env);
-    while (!WindowShouldClose()) {
-        env.actions[0] = rand() % (env.size * env.size);
-        c_step(&env);
-        c_render(&env);
-    }
+  }
 
-    free(env.observations);
-    free(env.actions);
-    free(env.rewards);
-    free(env.terminals);
-    c_close(&env);
+  free(env.observations);
+  free(env.actions);
+  free(env.rewards);
+  free(env.terminals);
+  c_close(&env);
 }
diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h
index f41f6244e..7ef7d9ed9 100644
--- a/pufferlib/ocean/nonogram/nonogram.h
+++ b/pufferlib/ocean/nonogram/nonogram.h
@@ -2,10 +2,10 @@
  * Players fill cells based on row and column clues (run-length encoding)
  */
 
+#include "raylib.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "raylib.h"
 
 #define MAX_SIZE 8
 #define MAX_CLUES (MAX_SIZE / 2)
@@ -25,569 +25,582 @@ const float REWARD_EASY_LEARN_INCORRECT = -0.01;
 const float REWARD_NO_MATCH = -0.05;
 
 typedef struct {
-    float score;
-    float episode_return;
-    float episode_length;
-    float solved;
-    float n;
+  float score;
+  float episode_return;
+  float episode_length;
+  float solved;
+  float n;
 } Log;
 
 typedef struct {
-    Log log;
-    unsigned char* observations;
-    int* actions;
-    float* rewards;
-    unsigned char* terminals;
-
-    int size;
-    int min_size;
-    int max_size;
-    int max_steps;
-    int steps_taken;
-    int filled_total;
-    int target_total;
-    int easy_learn;
-
-    unsigned char solution[MAX_SIZE * MAX_SIZE];
-
-    unsigned char rows_clues[MAX_SIZE * MAX_CLUES];
-    unsigned char cols_clues[MAX_SIZE * MAX_CLUES];
-    unsigned char rows_num_runs[MAX_SIZE];
-    unsigned char cols_num_runs[MAX_SIZE];
-    unsigned char rows_target_sum[MAX_SIZE];
-    unsigned char cols_target_sum[MAX_SIZE];
-    unsigned char rows_max_clue[MAX_SIZE];
-    unsigned char cols_max_clue[MAX_SIZE];
-
-    unsigned char rows_totals[MAX_SIZE];
-    unsigned char cols_totals[MAX_SIZE];
-
-    unsigned char rows_completed[MAX_SIZE];
-    unsigned char cols_completed[MAX_SIZE];
-
-    float episode_reward;
+  Log log;
+  unsigned char *observations;
+  int *actions;
+  float *rewards;
+  unsigned char *terminals;
+
+  int size;
+  int min_size;
+  int max_size;
+  int max_steps;
+  int steps_taken;
+  int filled_total;
+  int target_total;
+  int easy_learn;
+
+  unsigned char solution[MAX_SIZE * MAX_SIZE];
+
+  unsigned char rows_clues[MAX_SIZE * MAX_CLUES];
+  unsigned char cols_clues[MAX_SIZE * MAX_CLUES];
+  unsigned char rows_num_runs[MAX_SIZE];
+  unsigned char cols_num_runs[MAX_SIZE];
+  unsigned char rows_target_sum[MAX_SIZE];
+  unsigned char cols_target_sum[MAX_SIZE];
+  unsigned char rows_max_clue[MAX_SIZE];
+  unsigned char cols_max_clue[MAX_SIZE];
+
+  unsigned char rows_totals[MAX_SIZE];
+  unsigned char cols_totals[MAX_SIZE];
+
+  unsigned char rows_completed[MAX_SIZE];
+  unsigned char cols_completed[MAX_SIZE];
+
+  float episode_reward;
 } Nonogram;
 
-void add_log(Nonogram* env) {
-    env->log.score += env->rewards[0];
-    env->log.episode_length += env->steps_taken;
-    env->log.episode_return += env->episode_reward;
-    env->log.solved += (env->rewards[0] > 0) ? 1 : 0;
-    env->log.n++;
+void add_log(Nonogram *env) {
+  env->log.score += env->rewards[0];
+  env->log.episode_length += env->steps_taken;
+  env->log.episode_return += env->episode_reward;
+  env->log.solved += (env->rewards[0] > 0) ? 1 : 0;
+  env->log.n++;
 }
 
-int get_row_run_length(Nonogram* env, int row, int col) {
-    int row_start = row * MAX_SIZE;
-    int run_length = 1;
+int get_row_run_length(Nonogram *env, int row, int col) {
+  int row_start = row * MAX_SIZE;
+  int run_length = 1;
 
-    for (int c = col - 1; c >= 0; c--) {
-        if (env->observations[row_start + c] == CELL_BLACK) {
-            run_length++;
-        } else {
-            break;
-        }
+  for (int c = col - 1; c >= 0; c--) {
+    if (env->observations[row_start + c] == CELL_BLACK) {
+      run_length++;
+    } else {
+      break;
     }
+  }
 
-    for (int c = col + 1; c < env->size; c++) {
-        if (env->observations[row_start + c] == CELL_BLACK) {
-            run_length++;
-        } else {
-            break;
-        }
+  for (int c = col + 1; c < env->size; c++) {
+    if (env->observations[row_start + c] == CELL_BLACK) {
+      run_length++;
+    } else {
+      break;
     }
+  }
 
-    return run_length;
+  return run_length;
 }
 
-int get_col_run_length(Nonogram* env, int row, int col) {
-    int run_length = 1;
+int get_col_run_length(Nonogram *env, int row, int col) {
+  int run_length = 1;
 
-    for (int r = row - 1; r >= 0; r--) {
-        if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) {
-            run_length++;
-        } else {
-            break;
-        }
+  for (int r = row - 1; r >= 0; r--) {
+    if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) {
+      run_length++;
+    } else {
+      break;
     }
+  }
 
-    for (int r = row + 1; r < env->size; r++) {
-        if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) {
-            run_length++;
-        } else {
-            break;
-        }
+  for (int r = row + 1; r < env->size; r++) {
+    if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) {
+      run_length++;
+    } else {
+      break;
     }
+  }
 
-    return run_length;
+  return run_length;
 }
 
-int check_line_matches(unsigned char* line_data, unsigned char* clues, int num_runs, int size) {
-    int run_idx = 0;
-    int count = 0;
-
-    for (int i = 0; i < size; i++) {
-        if (line_data[i] == CELL_BLACK) {
-            count++;
-        } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) {
-            if (count > 0) {
-                if (clues[run_idx] != count) {
-                    return 0;
-                }
-                run_idx++;
-                count = 0;
-            }
-        }
-    }
+int check_line_matches(unsigned char *line_data, unsigned char *clues,
+                       int num_runs, int size) {
+  int run_idx = 0;
+  int count = 0;
 
-    if (count > 0) {
+  for (int i = 0; i < size; i++) {
+    if (line_data[i] == CELL_BLACK) {
+      count++;
+    } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) {
+      if (count > 0) {
         if (clues[run_idx] != count) {
-            return 0;
+          return 0;
         }
         run_idx++;
+        count = 0;
+      }
     }
+  }
 
-    return (run_idx == num_runs);
-}
+  if (count > 0) {
+    if (clues[run_idx] != count) {
+      return 0;
+    }
+    run_idx++;
+  }
 
-float rand_uniform() {
-    return (float)rand() / (float)RAND_MAX;
+  return (run_idx == num_runs);
 }
 
-void c_reset(Nonogram* env) {
-    env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1));
-    env->max_steps = env->easy_learn ? env->size * env->size : 4 * env->size * env->size;
+float rand_uniform() { return (float)rand() / (float)RAND_MAX; }
 
-    int full_grid_size = MAX_SIZE * MAX_SIZE;
-    int max_clues = MAX_SIZE / 2;
+void c_reset(Nonogram *env) {
+  env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1));
+  env->max_steps =
+      env->easy_learn ? env->size * env->size : 4 * env->size * env->size;
 
-    memset(env->observations, CELL_PADDING, full_grid_size);
-    for (int r = 0; r < env->size; r++) {
-        for (int c = 0; c < env->size; c++) {
-            env->observations[r * MAX_SIZE + c] = CELL_EMPTY;
-        }
-    }
-    memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues);
+  int full_grid_size = MAX_SIZE * MAX_SIZE;
+  int max_clues = MAX_SIZE / 2;
 
-    float fill_prob = rand_uniform();
-    memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE);
-    int has_filled = 0;
-    for (int i = 0; i < env->size; i++) {
-        for (int j = 0; j < env->size; j++) {
-            if (rand_uniform() < fill_prob) {
-                env->solution[i * MAX_SIZE + j] = CELL_BLACK;
-                has_filled = 1;
-            }
-        }
+  memset(env->observations, CELL_PADDING, full_grid_size);
+  for (int r = 0; r < env->size; r++) {
+    for (int c = 0; c < env->size; c++) {
+      env->observations[r * MAX_SIZE + c] = CELL_EMPTY;
     }
+  }
+  memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues);
 
-    if (!has_filled) {
-        int rand_row = rand() % env->size;
-        int rand_col = rand() % env->size;
-        env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK;
+  float fill_prob = rand_uniform();
+  memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE);
+  int has_filled = 0;
+  for (int i = 0; i < env->size; i++) {
+    for (int j = 0; j < env->size; j++) {
+      if (rand_uniform() < fill_prob) {
+        env->solution[i * MAX_SIZE + j] = CELL_BLACK;
+        has_filled = 1;
+      }
     }
+  }
 
-    memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES);
-    memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES);
+  if (!has_filled) {
+    int rand_row = rand() % env->size;
+    int rand_col = rand() % env->size;
+    env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK;
+  }
 
-    for (int i = 0; i < env->size; i++) {
-        int clue_idx = 0;
-        int count = 0;
-        for (int j = 0; j < env->size; j++) {
-            if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) {
-                count++;
-            } else if (count > 0) {
-                env->rows_clues[i * MAX_CLUES + clue_idx] = count;
-                clue_idx++;
-                count = 0;
-            }
-        }
-        if (count > 0) {
-            env->rows_clues[i * MAX_CLUES + clue_idx] = count;
-            clue_idx++;
-        }
-        env->rows_num_runs[i] = clue_idx;
-    }
+  memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES);
+  memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES);
 
+  for (int i = 0; i < env->size; i++) {
+    int clue_idx = 0;
+    int count = 0;
     for (int j = 0; j < env->size; j++) {
-        int clue_idx = 0;
-        int count = 0;
-        for (int i = 0; i < env->size; i++) {
-            if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) {
-                count++;
-            } else if (count > 0) {
-                env->cols_clues[j * MAX_CLUES + clue_idx] = count;
-                clue_idx++;
-                count = 0;
-            }
-        }
-        if (count > 0) {
-            env->cols_clues[j * MAX_CLUES + clue_idx] = count;
-            clue_idx++;
-        }
-        env->cols_num_runs[j] = clue_idx;
+      if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) {
+        count++;
+      } else if (count > 0) {
+        env->rows_clues[i * MAX_CLUES + clue_idx] = count;
+        clue_idx++;
+        count = 0;
+      }
     }
-
-    memcpy(env->observations + full_grid_size, env->rows_clues, MAX_SIZE * max_clues);
-    memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues, env->cols_clues, MAX_SIZE * max_clues);
-
-    env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size;
-
-    memset(env->rows_totals, 0, MAX_SIZE);
-    memset(env->cols_totals, 0, MAX_SIZE);
-    memset(env->rows_completed, 0, MAX_SIZE);
-    memset(env->cols_completed, 0, MAX_SIZE);
-    env->filled_total = 0;
-
-    for (int i = 0; i < env->size; i++) {
-        int max_clue = 0;
-        int sum = 0;
-        for (int j = 0; j < max_clues; j++) {
-            int clue = env->rows_clues[i * MAX_CLUES + j];
-            if (clue > max_clue) {
-                max_clue = clue;
-            }
-            sum += clue;
-        }
-        env->rows_max_clue[i] = max_clue;
-        env->rows_target_sum[i] = sum;
-
-        max_clue = 0;
-        sum = 0;
-        for (int j = 0; j < max_clues; j++) {
-            int clue = env->cols_clues[i * MAX_CLUES + j];
-            if (clue > max_clue) {
-                max_clue = clue;
-            }
-            sum += clue;
-        }
-        env->cols_max_clue[i] = max_clue;
-        env->cols_target_sum[i] = sum;
+    if (count > 0) {
+      env->rows_clues[i * MAX_CLUES + clue_idx] = count;
+      clue_idx++;
     }
+    env->rows_num_runs[i] = clue_idx;
+  }
 
-    env->target_total = 0;
+  for (int j = 0; j < env->size; j++) {
+    int clue_idx = 0;
+    int count = 0;
     for (int i = 0; i < env->size; i++) {
-        env->target_total += env->rows_target_sum[i];
+      if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) {
+        count++;
+      } else if (count > 0) {
+        env->cols_clues[j * MAX_CLUES + clue_idx] = count;
+        clue_idx++;
+        count = 0;
+      }
+    }
+    if (count > 0) {
+      env->cols_clues[j * MAX_CLUES + clue_idx] = count;
+      clue_idx++;
+    }
+    env->cols_num_runs[j] = clue_idx;
+  }
+
+  memcpy(env->observations + full_grid_size, env->rows_clues,
+         MAX_SIZE * max_clues);
+  memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues,
+         env->cols_clues, MAX_SIZE * max_clues);
+
+  env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size;
+
+  memset(env->rows_totals, 0, MAX_SIZE);
+  memset(env->cols_totals, 0, MAX_SIZE);
+  memset(env->rows_completed, 0, MAX_SIZE);
+  memset(env->cols_completed, 0, MAX_SIZE);
+  env->filled_total = 0;
+
+  for (int i = 0; i < env->size; i++) {
+    int max_clue = 0;
+    int sum = 0;
+    for (int j = 0; j < max_clues; j++) {
+      int clue = env->rows_clues[i * MAX_CLUES + j];
+      if (clue > max_clue) {
+        max_clue = clue;
+      }
+      sum += clue;
+    }
+    env->rows_max_clue[i] = max_clue;
+    env->rows_target_sum[i] = sum;
+
+    max_clue = 0;
+    sum = 0;
+    for (int j = 0; j < max_clues; j++) {
+      int clue = env->cols_clues[i * MAX_CLUES + j];
+      if (clue > max_clue) {
+        max_clue = clue;
+      }
+      sum += clue;
     }
+    env->cols_max_clue[i] = max_clue;
+    env->cols_target_sum[i] = sum;
+  }
 
-    env->steps_taken = 0;
-    env->episode_reward = 0;
-}
+  env->target_total = 0;
+  for (int i = 0; i < env->size; i++) {
+    env->target_total += env->rows_target_sum[i];
+  }
 
-void c_step(Nonogram* env) {
-    int action = env->actions[0];
+  env->steps_taken = 0;
+  env->episode_reward = 0;
+}
 
-    env->terminals[0] = 0;
-    env->rewards[0] = 0;
+void c_step(Nonogram *env) {
+  int action = env->actions[0];
+
+  env->terminals[0] = 0;
+  env->rewards[0] = 0;
+
+  env->steps_taken++;
+
+  if (env->steps_taken > env->max_steps) {
+    env->terminals[0] = 1;
+    env->rewards[0] = REWARD_TIMEOUT;
+    env->episode_reward += REWARD_TIMEOUT;
+    add_log(env);
+    c_reset(env);
+    return;
+  }
+
+  int mark_black = action >= (MAX_SIZE * MAX_SIZE);
+  int pos = action % (MAX_SIZE * MAX_SIZE);
+
+  int row = pos / MAX_SIZE;
+  int col = pos % MAX_SIZE;
+
+  if (row >= env->size || col >= env->size) {
+    env->terminals[0] = 1;
+    env->rewards[0] = REWARD_OUT_OF_BOUNDS;
+    env->episode_reward += REWARD_OUT_OF_BOUNDS;
+    add_log(env);
+    c_reset(env);
+    return;
+  }
+
+  unsigned char current = env->observations[pos];
+
+  if (current != CELL_EMPTY) {
+    env->terminals[0] = 1;
+    env->rewards[0] = REWARD_INVALID_MOVE;
+    env->episode_reward += REWARD_INVALID_MOVE;
+    add_log(env);
+    c_reset(env);
+    return;
+  }
+
+  if (mark_black) {
+    if (env->rows_totals[row] == env->rows_target_sum[row] ||
+        env->cols_totals[col] == env->cols_target_sum[col]) {
+      env->terminals[0] = 1;
+      env->rewards[0] = REWARD_INVALID_MOVE;
+      env->episode_reward += REWARD_INVALID_MOVE;
+      add_log(env);
+      c_reset(env);
+      return;
+    }
 
-    env->steps_taken++;
+    int row_run = get_row_run_length(env, row, col);
 
-    if (env->steps_taken > env->max_steps) {
-        env->terminals[0] = 1;
-        env->rewards[0] = REWARD_TIMEOUT;
-        env->episode_reward += REWARD_TIMEOUT;
-        add_log(env);
-        c_reset(env);
-        return;
+    if (row_run > env->rows_max_clue[row]) {
+      env->terminals[0] = 1;
+      env->rewards[0] = REWARD_INVALID_MOVE;
+      env->episode_reward += REWARD_INVALID_MOVE;
+      add_log(env);
+      c_reset(env);
+      return;
     }
 
-    int mark_black = action >= (MAX_SIZE * MAX_SIZE);
-    int pos = action % (MAX_SIZE * MAX_SIZE);
+    int col_run = get_col_run_length(env, row, col);
 
-    int row = pos / MAX_SIZE;
-    int col = pos % MAX_SIZE;
+    if (col_run > env->cols_max_clue[col]) {
+      env->terminals[0] = 1;
+      env->rewards[0] = REWARD_INVALID_MOVE;
+      env->episode_reward += REWARD_INVALID_MOVE;
+      add_log(env);
+      c_reset(env);
+      return;
+    }
 
-    if (row >= env->size || col >= env->size) {
+    int row_completed = 0;
+    int col_completed = 0;
+
+    if (env->rows_totals[row] == env->rows_target_sum[row] - 1) {
+      env->observations[pos] = CELL_BLACK;
+      int row_start = row * MAX_SIZE;
+      int matches = check_line_matches(env->observations + row_start,
+                                       env->rows_clues + row * MAX_CLUES,
+                                       env->rows_num_runs[row], env->size);
+      if (!matches) {
+        env->observations[pos] = CELL_EMPTY;
         env->terminals[0] = 1;
-        env->rewards[0] = REWARD_OUT_OF_BOUNDS;
-        env->episode_reward += REWARD_OUT_OF_BOUNDS;
+        env->rewards[0] = REWARD_NO_MATCH;
+        env->episode_reward += REWARD_NO_MATCH;
         add_log(env);
         c_reset(env);
         return;
+      }
+      env->observations[pos] = CELL_EMPTY;
+      row_completed = 1;
     }
 
-    unsigned char current = env->observations[pos];
-
-    if (current != CELL_EMPTY) {
+    if (env->cols_totals[col] == env->cols_target_sum[col] - 1) {
+      env->observations[pos] = CELL_BLACK;
+      unsigned char col_data[MAX_SIZE];
+      for (int i = 0; i < env->size; i++) {
+        col_data[i] = env->observations[i * MAX_SIZE + col];
+      }
+      int matches =
+          check_line_matches(col_data, env->cols_clues + col * MAX_CLUES,
+                             env->cols_num_runs[col], env->size);
+      if (!matches) {
+        env->observations[pos] = CELL_EMPTY;
         env->terminals[0] = 1;
-        env->rewards[0] = REWARD_INVALID_MOVE;
-        env->episode_reward += REWARD_INVALID_MOVE;
+        env->rewards[0] = REWARD_NO_MATCH;
+        env->episode_reward += REWARD_NO_MATCH;
         add_log(env);
         c_reset(env);
         return;
+      }
+      env->observations[pos] = CELL_EMPTY;
+      col_completed = 1;
     }
 
-    if (mark_black) {
-        if (env->rows_totals[row] == env->rows_target_sum[row] ||
-            env->cols_totals[col] == env->cols_target_sum[col]) {
-            env->terminals[0] = 1;
-            env->rewards[0] = REWARD_INVALID_MOVE;
-            env->episode_reward += REWARD_INVALID_MOVE;
-            add_log(env);
-            c_reset(env);
-            return;
-        }
-
-        int row_run = get_row_run_length(env, row, col);
-
-        if (row_run > env->rows_max_clue[row]) {
-            env->terminals[0] = 1;
-            env->rewards[0] = REWARD_INVALID_MOVE;
-            env->episode_reward += REWARD_INVALID_MOVE;
-            add_log(env);
-            c_reset(env);
-            return;
-        }
-
-        int col_run = get_col_run_length(env, row, col);
-
-        if (col_run > env->cols_max_clue[col]) {
-            env->terminals[0] = 1;
-            env->rewards[0] = REWARD_INVALID_MOVE;
-            env->episode_reward += REWARD_INVALID_MOVE;
-            add_log(env);
-            c_reset(env);
-            return;
-        }
-
-        int row_completed = 0;
-        int col_completed = 0;
-
-        if (env->rows_totals[row] == env->rows_target_sum[row] - 1) {
-            env->observations[pos] = CELL_BLACK;
-            int row_start = row * MAX_SIZE;
-            int matches = check_line_matches(env->observations + row_start,
-                                   env->rows_clues + row * MAX_CLUES,
-                                   env->rows_num_runs[row], env->size);
-            if (!matches) {
-                env->observations[pos] = CELL_EMPTY;
-                env->terminals[0] = 1;
-                env->rewards[0] = REWARD_NO_MATCH;
-                env->episode_reward += REWARD_NO_MATCH;
-                add_log(env);
-                c_reset(env);
-                return;
-            }
-            env->observations[pos] = CELL_EMPTY;
-            row_completed = 1;
-        }
-
-        if (env->cols_totals[col] == env->cols_target_sum[col] - 1) {
-            env->observations[pos] = CELL_BLACK;
-            unsigned char col_data[MAX_SIZE];
-            for (int i = 0; i < env->size; i++) {
-                col_data[i] = env->observations[i * MAX_SIZE + col];
-            }
-            int matches = check_line_matches(col_data,
-                                   env->cols_clues + col * MAX_CLUES,
-                                   env->cols_num_runs[col], env->size);
-            if (!matches) {
-                env->observations[pos] = CELL_EMPTY;
-                env->terminals[0] = 1;
-                env->rewards[0] = REWARD_NO_MATCH;
-                env->episode_reward += REWARD_NO_MATCH;
-                add_log(env);
-                c_reset(env);
-                return;
-            }
-            env->observations[pos] = CELL_EMPTY;
-            col_completed = 1;
-        }
-
-        env->observations[pos] = CELL_BLACK;
-        env->rows_totals[row]++;
-        env->cols_totals[col]++;
-        env->filled_total++;
-
-        int row_newly_completed = row_completed && !env->rows_completed[row];
-        int col_newly_completed = col_completed && !env->cols_completed[col];
-
-        if (row_newly_completed) env->rows_completed[row] = 1;
-        if (col_newly_completed) env->cols_completed[col] = 1;
-
-        float line_reward = (row_newly_completed + col_newly_completed) * REWARD_COMPLETE_LINE;
-        env->rewards[0] += line_reward;
-        env->episode_reward += line_reward;
+    env->observations[pos] = CELL_BLACK;
+    env->rows_totals[row]++;
+    env->cols_totals[col]++;
+    env->filled_total++;
+
+    int row_newly_completed = row_completed && !env->rows_completed[row];
+    int col_newly_completed = col_completed && !env->cols_completed[col];
+
+    if (row_newly_completed)
+      env->rows_completed[row] = 1;
+    if (col_newly_completed)
+      env->cols_completed[col] = 1;
+
+    float line_reward =
+        (row_newly_completed + col_newly_completed) * REWARD_COMPLETE_LINE;
+    env->rewards[0] += line_reward;
+    env->episode_reward += line_reward;
+  } else {
+    env->observations[pos] = CELL_WHITE;
+  }
+
+  if (env->easy_learn) {
+    unsigned char solution_cell = env->solution[pos];
+    unsigned char actual = env->observations[pos];
+
+    if (solution_cell == actual) {
+      env->rewards[0] += REWARD_EASY_LEARN_CORRECT;
+      env->episode_reward += REWARD_EASY_LEARN_CORRECT;
     } else {
-        env->observations[pos] = CELL_WHITE;
-    }
-
-    if (env->easy_learn) {
-        unsigned char solution_cell = env->solution[pos];
-        unsigned char actual = env->observations[pos];
-
-        if (solution_cell == actual) {
-            env->rewards[0] += REWARD_EASY_LEARN_CORRECT;
-            env->episode_reward += REWARD_EASY_LEARN_CORRECT;
-        } else {
-            env->rewards[0] += REWARD_EASY_LEARN_INCORRECT;
-            env->episode_reward += REWARD_EASY_LEARN_INCORRECT;
-            env->terminals[0] = 1;
-            add_log(env);
-            c_reset(env);
-            return;
-        }
-    }
-
-    if (env->filled_total == env->target_total) {
-        env->terminals[0] = 1;
-        env->rewards[0] = REWARD_WIN;
-        env->episode_reward += REWARD_WIN;
-        add_log(env);
-        c_reset(env);
-        return;
+      env->rewards[0] += REWARD_EASY_LEARN_INCORRECT;
+      env->episode_reward += REWARD_EASY_LEARN_INCORRECT;
+      env->terminals[0] = 1;
+      add_log(env);
+      c_reset(env);
+      return;
     }
+  }
+
+  if (env->filled_total == env->target_total) {
+    env->terminals[0] = 1;
+    env->rewards[0] = REWARD_WIN;
+    env->episode_reward += REWARD_WIN;
+    add_log(env);
+    c_reset(env);
+    return;
+  }
 }
 
-void c_render(Nonogram* env) {
-    if (!IsWindowReady()) {
-        int board_width = 120 + MAX_SIZE * 40;
-        int board_height = 120 + MAX_SIZE * 40;
-        int screen_width = board_width * 2 + 60 + 40;
-        int screen_height = board_height + 140;
-        InitWindow(screen_width, screen_height, "Nonogram (C)");
-        SetTargetFPS(60);
+void c_render(Nonogram *env) {
+  if (!IsWindowReady()) {
+    int board_width = 120 + MAX_SIZE * 40;
+    int board_height = 120 + MAX_SIZE * 40;
+    int screen_width = board_width * 2 + 60 + 40;
+    int screen_height = board_height + 140;
+    InitWindow(screen_width, screen_height, "Nonogram (C)");
+    SetTargetFPS(60);
+  }
+
+  if (IsKeyDown(KEY_ESCAPE)) {
+    exit(0);
+  }
+
+  BeginDrawing();
+  ClearBackground((Color){0, 0, 0, 255});
+
+  int cell_size = 40;
+  int clue_area = 120;
+  int board_spacing = 60;
+  int font_size = 20;
+
+  // Draw titles
+  DrawText("CURRENT BOARD", 20, 20, 24, RAYWHITE);
+  int solution_x = clue_area + env->size * cell_size + board_spacing + 20;
+  DrawText("SOLUTION", solution_x, 20, 24, RAYWHITE);
+
+  // Draw current board
+  int offset_x = 20;
+  int offset_y = 60;
+
+  // Draw column clues for current board
+  for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) {
+    for (int c = 0; c < env->size; c++) {
+      int clue = env->cols_clues[c * MAX_CLUES + clue_row];
+      if (clue > 0) {
+        char text[4];
+        snprintf(text, sizeof(text), "%d", clue);
+        int x = offset_x + clue_area + c * cell_size + cell_size / 2;
+        int y = offset_y + clue_row * 20 + 10;
+        int text_width = MeasureText(text, font_size);
+        DrawText(text, x - text_width / 2, y, font_size, RAYWHITE);
+      }
     }
-
-    if (IsKeyDown(KEY_ESCAPE)) {
-        exit(0);
+  }
+
+  // Draw row clues for current board
+  for (int r = 0; r < env->size; r++) {
+    int clue_x = offset_x + 10;
+    for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) {
+      int clue = env->rows_clues[r * MAX_CLUES + clue_idx];
+      if (clue > 0) {
+        char text[4];
+        snprintf(text, sizeof(text), "%d", clue);
+        int y = offset_y + clue_area + r * cell_size + cell_size / 2 -
+                font_size / 2;
+        DrawText(text, clue_x, y, font_size, RAYWHITE);
+        clue_x += MeasureText(text, font_size) + 5;
+      }
     }
-
-    BeginDrawing();
-    ClearBackground((Color){0, 0, 0, 255});
-
-    int cell_size = 40;
-    int clue_area = 120;
-    int board_spacing = 60;
-    int font_size = 20;
-
-    // Draw titles
-    DrawText("CURRENT BOARD", 20, 20, 24, RAYWHITE);
-    int solution_x = clue_area + env->size * cell_size + board_spacing + 20;
-    DrawText("SOLUTION", solution_x, 20, 24, RAYWHITE);
-
-    // Draw current board
-    int offset_x = 20;
-    int offset_y = 60;
-
-    // Draw column clues for current board
-    for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) {
-        for (int c = 0; c < env->size; c++) {
-            int clue = env->cols_clues[c * MAX_CLUES + clue_row];
-            if (clue > 0) {
-                char text[4];
-                snprintf(text, sizeof(text), "%d", clue);
-                int x = offset_x + clue_area + c * cell_size + cell_size / 2;
-                int y = offset_y + clue_row * 20 + 10;
-                int text_width = MeasureText(text, font_size);
-                DrawText(text, x - text_width / 2, y, font_size, RAYWHITE);
-            }
-        }
+  }
+
+  // Draw current grid
+  for (int r = 0; r < env->size; r++) {
+    for (int c = 0; c < env->size; c++) {
+      int x = offset_x + clue_area + c * cell_size;
+      int y = offset_y + clue_area + r * cell_size;
+      int pos = r * MAX_SIZE + c;
+
+      if (env->observations[pos] == CELL_BLACK) {
+        DrawRectangle(x, y, cell_size, cell_size,
+                      (Color){50, 50, 50, 255}); // Dark gray for BLACK
+      } else if (env->observations[pos] == CELL_WHITE) {
+        DrawRectangle(x, y, cell_size, cell_size,
+                      (Color){240, 240, 240, 255}); // Light gray for WHITE
+      } else {
+        DrawRectangle(x, y, cell_size, cell_size,
+                      (Color){120, 120, 120, 255}); // Medium gray for EMPTY
+      }
+      DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY);
     }
-
-    // Draw row clues for current board
-    for (int r = 0; r < env->size; r++) {
-        int clue_x = offset_x + 10;
-        for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) {
-            int clue = env->rows_clues[r * MAX_CLUES + clue_idx];
-            if (clue > 0) {
-                char text[4];
-                snprintf(text, sizeof(text), "%d", clue);
-                int y = offset_y + clue_area + r * cell_size + cell_size / 2 - font_size / 2;
-                DrawText(text, clue_x, y, font_size, RAYWHITE);
-                clue_x += MeasureText(text, font_size) + 5;
-            }
-        }
+  }
+
+  // Draw solution board
+  offset_x = solution_x;
+
+  // Draw column clues for solution
+  for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) {
+    for (int c = 0; c < env->size; c++) {
+      int clue = env->cols_clues[c * MAX_CLUES + clue_row];
+      if (clue > 0) {
+        char text[4];
+        snprintf(text, sizeof(text), "%d", clue);
+        int x = offset_x + clue_area + c * cell_size + cell_size / 2;
+        int y = offset_y + clue_row * 20 + 10;
+        int text_width = MeasureText(text, font_size);
+        DrawText(text, x - text_width / 2, y, font_size, RAYWHITE);
+      }
     }
-
-    // Draw current grid
-    for (int r = 0; r < env->size; r++) {
-        for (int c = 0; c < env->size; c++) {
-            int x = offset_x + clue_area + c * cell_size;
-            int y = offset_y + clue_area + r * cell_size;
-            int pos = r * MAX_SIZE + c;
-
-            if (env->observations[pos] == CELL_BLACK) {
-                DrawRectangle(x, y, cell_size, cell_size, (Color){50, 50, 50, 255});  // Dark gray for BLACK
-            } else if (env->observations[pos] == CELL_WHITE) {
-                DrawRectangle(x, y, cell_size, cell_size, (Color){240, 240, 240, 255});  // Light gray for WHITE
-            } else {
-                DrawRectangle(x, y, cell_size, cell_size, (Color){120, 120, 120, 255});  // Medium gray for EMPTY
-            }
-            DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY);
-        }
+  }
+
+  // Draw row clues for solution
+  for (int r = 0; r < env->size; r++) {
+    int clue_x = offset_x + 10;
+    for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) {
+      int clue = env->rows_clues[r * MAX_CLUES + clue_idx];
+      if (clue > 0) {
+        char text[4];
+        snprintf(text, sizeof(text), "%d", clue);
+        int y = offset_y + clue_area + r * cell_size + cell_size / 2 -
+                font_size / 2;
+        DrawText(text, clue_x, y, font_size, RAYWHITE);
+        clue_x += MeasureText(text, font_size) + 5;
+      }
     }
-
-    // Draw solution board
-    offset_x = solution_x;
-
-    // Draw column clues for solution
-    for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) {
-        for (int c = 0; c < env->size; c++) {
-            int clue = env->cols_clues[c * MAX_CLUES + clue_row];
-            if (clue > 0) {
-                char text[4];
-                snprintf(text, sizeof(text), "%d", clue);
-                int x = offset_x + clue_area + c * cell_size + cell_size / 2;
-                int y = offset_y + clue_row * 20 + 10;
-                int text_width = MeasureText(text, font_size);
-                DrawText(text, x - text_width / 2, y, font_size, RAYWHITE);
-            }
-        }
+  }
+
+  // Draw solution grid
+  for (int r = 0; r < env->size; r++) {
+    for (int c = 0; c < env->size; c++) {
+      int x = offset_x + clue_area + c * cell_size;
+      int y = offset_y + clue_area + r * cell_size;
+      int pos = r * MAX_SIZE + c;
+
+      if (env->solution[pos] == CELL_BLACK) {
+        DrawRectangle(x, y, cell_size, cell_size, GREEN);
+      } else {
+        DrawRectangle(x, y, cell_size, cell_size, (Color){200, 200, 200, 255});
+      }
+      DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY);
     }
-
-    // Draw row clues for solution
-    for (int r = 0; r < env->size; r++) {
-        int clue_x = offset_x + 10;
-        for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) {
-            int clue = env->rows_clues[r * MAX_CLUES + clue_idx];
-            if (clue > 0) {
-                char text[4];
-                snprintf(text, sizeof(text), "%d", clue);
-                int y = offset_y + clue_area + r * cell_size + cell_size / 2 - font_size / 2;
-                DrawText(text, clue_x, y, font_size, RAYWHITE);
-                clue_x += MeasureText(text, font_size) + 5;
-            }
-        }
-    }
-
-    // Draw solution grid
-    for (int r = 0; r < env->size; r++) {
-        for (int c = 0; c < env->size; c++) {
-            int x = offset_x + clue_area + c * cell_size;
-            int y = offset_y + clue_area + r * cell_size;
-            int pos = r * MAX_SIZE + c;
-
-            if (env->solution[pos] == CELL_BLACK) {
-                DrawRectangle(x, y, cell_size, cell_size, GREEN);
-            } else {
-                DrawRectangle(x, y, cell_size, cell_size, (Color){200, 200, 200, 255});
-            }
-            DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY);
-        }
-    }
-
-    // Draw status
-    int board_height = clue_area + env->size * cell_size;
-    int status_y = board_height + 80;
-    char status[128];
-    snprintf(status, sizeof(status), "Steps: %d/%d | Filled: %d/%d | Size: %dx%d",
-             env->steps_taken, env->max_steps, env->filled_total, env->target_total, env->size, env->size);
-    DrawText(status, 20, status_y, 20, RAYWHITE);
-
-    // Draw reward info
-    char reward_info[128];
-    snprintf(reward_info, sizeof(reward_info), "Last Reward: %.3f | Episode Return: %.3f",
-             env->rewards[0], env->episode_reward);
-    DrawText(reward_info, 20, status_y + 25, 20, RAYWHITE);
-
-    // Draw instructions
-    DrawText("Click cells to toggle | Press R to reset | ESC to quit", 20, status_y + 60, 16, LIGHTGRAY);
-
-    EndDrawing();
+  }
+
+  // Draw status
+  int board_height = clue_area + env->size * cell_size;
+  int status_y = board_height + 80;
+  char status[128];
+  snprintf(status, sizeof(status), "Steps: %d/%d | Filled: %d/%d | Size: %dx%d",
+           env->steps_taken, env->max_steps, env->filled_total,
+           env->target_total, env->size, env->size);
+  DrawText(status, 20, status_y, 20, RAYWHITE);
+
+  // Draw reward info
+  char reward_info[128];
+  snprintf(reward_info, sizeof(reward_info),
+           "Last Reward: %.3f | Episode Return: %.3f", env->rewards[0],
+           env->episode_reward);
+  DrawText(reward_info, 20, status_y + 25, 20, RAYWHITE);
+
+  // Draw instructions
+  DrawText("Click cells to toggle | Press R to reset | ESC to quit", 20,
+           status_y + 60, 16, LIGHTGRAY);
+
+  EndDrawing();
 }
 
-void c_close(Nonogram* env) {
-    if (IsWindowReady()) {
-        CloseWindow();
-    }
+void c_close(Nonogram *env) {
+  if (IsWindowReady()) {
+    CloseWindow();
+  }
 }

From d27638e60f8c499d3ffa08b53bbfb2c880891527 Mon Sep 17 00:00:00 2001
From: Eitan Porat <eitan.porat@weizmann.ac.il>
Date: Tue, 21 Oct 2025 22:24:34 +0000
Subject: [PATCH 5/5] minor

---
 pufferlib/config/ocean/nonogram.ini | 2 +-
 pufferlib/ocean/nonogram/nonogram.h | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pufferlib/config/ocean/nonogram.ini b/pufferlib/config/ocean/nonogram.ini
index 7f203644a..a503cf3d8 100644
--- a/pufferlib/config/ocean/nonogram.ini
+++ b/pufferlib/config/ocean/nonogram.ini
@@ -8,7 +8,7 @@ rnn_name = Recurrent
 num_envs = 4096
 min_size = 4
 max_size = 8
-easy_learn = 0
+easy_learn = 1
 
 [sweep]
 metric = score
diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h
index 7ef7d9ed9..e14ced293 100644
--- a/pufferlib/ocean/nonogram/nonogram.h
+++ b/pufferlib/ocean/nonogram/nonogram.h
@@ -154,8 +154,7 @@ float rand_uniform() { return (float)rand() / (float)RAND_MAX; }
 
 void c_reset(Nonogram *env) {
   env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1));
-  env->max_steps =
-      env->easy_learn ? env->size * env->size : 4 * env->size * env->size;
+  env->max_steps = env->size * env->size;
 
   int full_grid_size = MAX_SIZE * MAX_SIZE;
   int max_clues = MAX_SIZE / 2;