From 44bd80cecbc42d2e24dbdd2affabb2165960bba8 Mon Sep 17 00:00:00 2001 From: Eitan Porat Date: Tue, 21 Oct 2025 18:50:32 +0000 Subject: [PATCH 1/5] Add Nonogram puzzle environment - Add Nonogram C implementation (nonogram.c, nonogram.h, binding.c) - Add Python wrapper (nonogram.py) - Add Tetris-style neural network architecture for Nonogram in torch.py - Multi-layer CNN for grid with spatial downsampling - Separate encoders for row clues, column clues, and board size - No weight sharing between row and column encoders - Add configuration file (nonogram.ini) - Register NonogramLSTM and Nonogram policy classes --- pufferlib/config/ocean/nonogram.ini | 152 ++++++ pufferlib/ocean/nonogram/binding.c | 75 +++ pufferlib/ocean/nonogram/nonogram.c | 32 ++ pufferlib/ocean/nonogram/nonogram.h | 775 +++++++++++++++++++++++++++ pufferlib/ocean/nonogram/nonogram.py | 85 +++ pufferlib/ocean/torch.py | 91 ++++ 6 files changed, 1210 insertions(+) create mode 100644 pufferlib/config/ocean/nonogram.ini create mode 100644 pufferlib/ocean/nonogram/binding.c create mode 100644 pufferlib/ocean/nonogram/nonogram.c create mode 100644 pufferlib/ocean/nonogram/nonogram.h create mode 100644 pufferlib/ocean/nonogram/nonogram.py diff --git a/pufferlib/config/ocean/nonogram.ini b/pufferlib/config/ocean/nonogram.ini new file mode 100644 index 000000000..0c5d6ff44 --- /dev/null +++ b/pufferlib/config/ocean/nonogram.ini @@ -0,0 +1,152 @@ +[base] +package = ocean +env_name = puffer_nonogram +policy_name = Nonogram +; policy_name = Policy +rnn_name = Recurrent + +[env] +num_envs = 4096 +min_size = 4 +max_size = 8 +easy_learn = 1 + +[sweep] +metric = score + +[train] +; Hyperparameters from wandb config +name = pufferai +seed = 42 +gamma = 0.99965 +device = cuda +compile = False +project = ablations +use_rnn = True +vf_coef = 2.365 +adam_eps = 1.566e-10 +data_dir = experiments +ent_coef = 0.01554 +anneal_lr = True +clip_coef = 0.1267 +optimizer = muon +precision = float32 +adam_beta1 = 0.7912 
+adam_beta2 = 0.999949 +batch_size = auto +gae_lambda = 0.9007 +prio_alpha = 0.7441 +prio_beta0 = 0.7365 +cpu_offload = False +bptt_horizon = 64 +compile_mode = max-autotune-no-cudagraphs +vf_clip_coef = 1.598 +learning_rate = 0.007103 +max_grad_norm = 1.275 +update_epochs = 1 +vtrace_c_clip = 0.8692 +minibatch_size = 32768 +total_timesteps = 2e10 +vtrace_rho_clip = 0.9074 +compile_fullgraph = True +max_minibatch_size = 32768 +checkpoint_interval = 200 +torch_deterministic = True + +; PREVIOUS RUN: Run ID 3k4cpz3ts3keprggvyldpo0bzpo2djsc +; total_timesteps = 1e10 +; minibatch_size = 65536 +; use_rnn = True +; update_epochs = 1 +; bptt_horizon = 64 +; gae_lambda = 0.9860112307817481 +; gamma = 0.9955237802885055 +; clip_coef = 0.3339182687952462 +; vf_coef = 1.3604733057894562 +; vf_clip_coef = 0.1 +; ent_coef = 0.01267345559258322 +; max_grad_norm = 0.7481994494317118 +; learning_rate = 0.0071601604548789605 +; adam_eps = 2.1466958248623007e-10 +; adam_beta1 = 0.9600776540257598 +; adam_beta2 = 0.9987918405974582 +; anneal_lr = True +; optimizer = muon +; prio_alpha = 0.9248668653880601 +; prio_beta0 = 0.9583638692801064 +; vtrace_c_clip = 2.931704492528996 +; vtrace_rho_clip = 1.2830763533710652 + +; PREVIOUS RUN: puffer_sweep_simple_policy +; total_timesteps = 1e10 +; minibatch_size = 16384 +; use_rnn = True +; update_epochs = 1 +; bptt_horizon = 64 +; gae_lambda = 0.9698 +; gamma = 0.9979 +; clip_coef = 0.1896 +; vf_coef = 1.4565 +; vf_clip_coef = 0.2296 +; ent_coef = 0.01257 +; max_grad_norm = 0.4804 +; learning_rate = 0.06449 +; adam_eps = 4.577e-10 +; adam_beta1 = 0.8184 +; adam_beta2 = 0.9996 +; anneal_lr = True +; optimizer = muon +; prio_alpha = 0.8445 +; prio_beta0 = 0.9498 +; vtrace_c_clip = 3.5953 +; vtrace_rho_clip = 2.2273 + +; PREVIOUS RUN: puffer_sweep_black_white_actions +; total_timesteps = 1e10 +; minibatch_size = 32768 +; use_rnn = True +; update_epochs = 1 +; bptt_horizon = 64 +; gae_lambda = 0.8645 +; gamma = 0.9991 +; clip_coef = 0.3043 +; 
vf_coef = 2.1905 +; vf_clip_coef = 3.1475 +; ent_coef = 0.002274 +; max_grad_norm = 1.0202 +; learning_rate = 0.007169 +; adam_eps = 6.036e-11 +; adam_beta1 = 0.9366 +; adam_beta2 = 0.9985 +; anneal_lr = True +; optimizer = muon +; prio_alpha = 0.8741 +; prio_beta0 = 0.7869 +; vtrace_c_clip = 1.6859 +; vtrace_rho_clip = 1.5254 +; total_timesteps = 10_000_000_000 +; minibatch_size = 32768 +; use_rnn = True +; update_epochs = 1 +; bptt_horizon = 64 +; gae_lambda = 0.6 +; gamma = 0.9999 +; clip_coef = 0.01 +; vf_coef = 4.453 +; vf_clip_coef = 0.1 +; ent_coef = 0.001160 +; max_grad_norm = 1.071 +; learning_rate = 0.003555 +; adam_eps = 1.675e-14 +; adam_beta1 = 0.9817 +; adam_beta2 = 0.9052 +; anneal_lr = True +; optimizer = muon +; compile = False +; precision = float32 +; torch_deterministic = True +; checkpoint_interval = 200 +; prio_alpha = 0.99 +; prio_beta0 = 0.855 +; vtrace_c_clip = 0.7794 +; vtrace_rho_clip = 0.8655 diff --git a/pufferlib/ocean/nonogram/binding.c b/pufferlib/ocean/nonogram/binding.c new file mode 100644 index 000000000..8f63c9d9b --- /dev/null +++ b/pufferlib/ocean/nonogram/binding.c @@ -0,0 +1,75 @@ +#include +#include "nonogram.h" + +// Forward declare custom methods +static PyObject* vec_get_solutions(PyObject* self, PyObject* args); +static PyObject* vec_get_size(PyObject* self, PyObject* args); + +#define Env Nonogram +#define MY_METHODS \ + {"vec_get_solutions", vec_get_solutions, METH_VARARGS, "Get solutions from all environments"}, \ + {"vec_get_size", vec_get_size, METH_VARARGS, "Get current board size"} + +#include "../env_binding.h" + +static int my_init(Env* env, PyObject* args, PyObject* kwargs) { + env->min_size = unpack(kwargs, "min_size"); + env->max_size = unpack(kwargs, "max_size"); + env->easy_learn = unpack(kwargs, "easy_learn"); + env->size = env->max_size; + env->max_steps = 4 * env->max_size * env->max_size; + return 0; +} + +static int my_log(PyObject* dict, Log* log) { + assign_to_dict(dict, "score", log->score); + 
assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "episode_length", log->episode_length); + assign_to_dict(dict, "solved", log->solved); + return 0; +} + +// Custom method to copy every environment's solution grid into a caller-supplied NumPy buffer (args: vec handle, contiguous array of at least num_envs * MAX_SIZE * MAX_SIZE bytes) +static PyObject* vec_get_solutions(PyObject* self, PyObject* args) { + if (PyTuple_Size(args) != 2) { + PyErr_SetString(PyExc_TypeError, "vec_get_solutions requires 2 arguments"); + return NULL; + } + + VecEnv* vec = unpack_vecenv(args); + if (!vec) { + return NULL; + } + + PyObject* solutions_obj = PyTuple_GetItem(args, 1); + if (!PyObject_TypeCheck(solutions_obj, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "solutions must be a NumPy array"); + return NULL; + } + PyArrayObject* solutions = (PyArrayObject*)solutions_obj; // size-checked below: the copy loop writes MAX_SIZE * MAX_SIZE bytes per environment + if (!PyArray_ISCONTIGUOUS(solutions) || PyArray_NBYTES(solutions) < (npy_intp)vec->num_envs * MAX_SIZE * MAX_SIZE) { + PyErr_SetString(PyExc_ValueError, "solutions must be contiguous and hold num_envs * MAX_SIZE * MAX_SIZE bytes"); + return NULL; + } + + // Copy solutions from each environment (fixed MAX_SIZE * MAX_SIZE stride, independent of the runtime max_size) + unsigned char* sol_ptr = PyArray_DATA(solutions); + int max_grid_size = MAX_SIZE * MAX_SIZE; + for (int i = 0; i < vec->num_envs; i++) { + Nonogram* env = vec->envs[i]; + memcpy(sol_ptr + i * max_grid_size, env->solution, max_grid_size); + } + + Py_RETURN_NONE; +} + +// Get current board size from first environment +static PyObject* vec_get_size(PyObject* self, PyObject* args) { + VecEnv* vec = unpack_vecenv(args); + if (!vec) { + return NULL; + } + + Nonogram* env = vec->envs[0]; + return PyLong_FromLong(env->size); +} diff --git a/pufferlib/ocean/nonogram/nonogram.c b/pufferlib/ocean/nonogram/nonogram.c new file mode 100644 index 000000000..953be3dd1 --- /dev/null +++ b/pufferlib/ocean/nonogram/nonogram.c @@ -0,0 +1,32 @@ +/* Pure C demo file for Nonogram. 
Build it with: + * bash scripts/build_ocean.sh nonogram local (debug) + * bash scripts/build_ocean.sh nonogram fast + */ + +#include "nonogram.h" + +int main() { + Nonogram env = {.size = MAX_SIZE, .min_size = MAX_SIZE, .max_size = MAX_SIZE}; // c_reset() draws size from [min_size, max_size]; leaving both 0 yields size 0 and rand() % 0 + int max_clues = MAX_CLUES; + int obs_size = MAX_SIZE * MAX_SIZE + 2 * MAX_SIZE * max_clues + 1; // grid + row clues + col clues + the board-size byte c_reset() writes last + + env.max_steps = 4 * env.size * env.size; + env.observations = (unsigned char*)calloc(obs_size, sizeof(unsigned char)); + env.actions = (int*)calloc(1, sizeof(int)); + env.rewards = (float*)calloc(1, sizeof(float)); + env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char)); + + c_reset(&env); + c_render(&env); + while (!WindowShouldClose()) { + env.actions[0] = rand() % (env.size * env.size); + c_step(&env); + c_render(&env); + } + + free(env.observations); + free(env.actions); + free(env.rewards); + free(env.terminals); + c_close(&env); +} diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h new file mode 100644 index 000000000..b4060a1de --- /dev/null +++ b/pufferlib/ocean/nonogram/nonogram.h @@ -0,0 +1,775 @@ +/* Nonogram: A logic puzzle environment + * Players fill cells based on row and column clues (run-length encoding) + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "raylib.h" + +// Debug mode: set to 1 to enable debug output, 0 to disable +#define DEBUG 0 + +#if DEBUG +#define debug_printf(...) printf(__VA_ARGS__) +#else +#define debug_printf(...) 
((void)0) +#endif + +#define MAX_SIZE 8 +#define MAX_CLUES (MAX_SIZE / 2) + +const unsigned char CELL_EMPTY = 0; +const unsigned char CELL_WHITE = 1; +const unsigned char CELL_BLACK = 2; +const unsigned char CELL_PADDING = 3; + +const float REWARD_WIN = 1.0; +const float REWARD_INVALID_MOVE = -0.2; +const float REWARD_OUT_OF_BOUNDS = -0.2; +const float REWARD_TIMEOUT = -0.1; +const float REWARD_COMPLETE_LINE = 0.02; +const float REWARD_EASY_LEARN_CORRECT = 0.01; +const float REWARD_EASY_LEARN_INCORRECT = -0.01; +const float REWARD_NO_MATCH = -0.05; + +// Required struct for logging +typedef struct { + float score; + float episode_return; + float episode_length; + float solved; + float n; +} Log; + +// Nonogram environment struct +typedef struct { + Log log; + unsigned char* observations; + int* actions; + float* rewards; + unsigned char* terminals; + + // Environment state + int size; + int min_size; + int max_size; + int max_steps; + int steps_taken; + int filled_total; + int target_total; + int easy_learn; + + // Solution (for generating clues) + unsigned char solution[MAX_SIZE * MAX_SIZE]; + + // Clues + unsigned char rows_clues[MAX_SIZE * MAX_CLUES]; + unsigned char cols_clues[MAX_SIZE * MAX_CLUES]; + unsigned char rows_num_runs[MAX_SIZE]; + unsigned char cols_num_runs[MAX_SIZE]; + unsigned char rows_target_sum[MAX_SIZE]; + unsigned char cols_target_sum[MAX_SIZE]; + unsigned char rows_max_clue[MAX_SIZE]; + unsigned char cols_max_clue[MAX_SIZE]; + + // Current totals + unsigned char rows_totals[MAX_SIZE]; + unsigned char cols_totals[MAX_SIZE]; + + // Track completed lines + unsigned char rows_completed[MAX_SIZE]; + unsigned char cols_completed[MAX_SIZE]; + + // Episode reward accumulator + float episode_reward; +} Nonogram; + +// Helper function implementations: add_log folds the finished episode into env->log (called right before c_reset on every terminal step) +void add_log(Nonogram* env) { + env->log.score += env->rewards[0]; + env->log.episode_length += env->steps_taken; + env->log.episode_return += env->episode_reward; // solved (next line) counts only terminal steps carrying REWARD_WIN: line/easy-learn bonuses can make a losing step net-positive + env->log.solved += (env->rewards[0] >= REWARD_WIN && env->terminals[0] > 
0) ? 1 : 0; + env->log.n++; +} + +int get_row_run_length(Nonogram* env, int row, int col) { + int row_start = row * MAX_SIZE; + int run_length = 1; + + debug_printf(" get_row_run_length: row=%d, col=%d, row_start=%d\n", row, col, row_start); + debug_printf(" Row cells before marking: "); + for (int c = 0; c < env->size; c++) { + debug_printf("%d ", env->observations[row_start + c]); + } + debug_printf("\n"); + + // Count left + int left_count = 0; + for (int c = col - 1; c >= 0; c--) { + if (env->observations[row_start + c] == CELL_BLACK) { + run_length++; + left_count++; + } else { + break; + } + } + debug_printf(" Left count: %d\n", left_count); + + // Count right + int right_count = 0; + for (int c = col + 1; c < env->size; c++) { + if (env->observations[row_start + c] == CELL_BLACK) { + run_length++; + right_count++; + } else { + break; + } + } + debug_printf(" Right count: %d\n", right_count); + debug_printf(" Total run_length (1 + left + right): %d\n", run_length); + + return run_length; +} + +int get_col_run_length(Nonogram* env, int row, int col) { + int run_length = 1; + + debug_printf(" get_col_run_length: row=%d, col=%d\n", row, col); + debug_printf(" Col cells before marking: "); + for (int r = 0; r < env->size; r++) { + debug_printf("%d ", env->observations[r * MAX_SIZE + col]); + } + debug_printf("\n"); + + // Count up + int up_count = 0; + for (int r = row - 1; r >= 0; r--) { + if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { + run_length++; + up_count++; + } else { + break; + } + } + debug_printf(" Up count: %d\n", up_count); + + // Count down + int down_count = 0; + for (int r = row + 1; r < env->size; r++) { + if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { + run_length++; + down_count++; + } else { + break; + } + } + debug_printf(" Down count: %d\n", down_count); + debug_printf(" Total run_length (1 + up + down): %d\n", run_length); + + return run_length; +} + +int check_line_matches(unsigned char* line_data, unsigned char* 
clues, int num_runs, int size) { + debug_printf(" check_line_matches: num_runs=%d, size=%d\n", num_runs, size); + debug_printf(" Line data: "); + for (int i = 0; i < size; i++) { + debug_printf("%d ", line_data[i]); + } + debug_printf("\n"); + debug_printf(" Expected clues: "); + for (int i = 0; i < num_runs; i++) { + debug_printf("%d ", clues[i]); + } + debug_printf("\n"); + + int run_idx = 0; + int count = 0; + + for (int i = 0; i < size; i++) { + if (line_data[i] == CELL_BLACK) { + count++; + debug_printf(" Position %d: BLACK, count=%d\n", i, count); + } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) { + if (count > 0) { + debug_printf(" End of run at position %d: count=%d, expected=%d (run_idx=%d)\n", + i, count, clues[run_idx], run_idx); + if (clues[run_idx] != count) { + debug_printf(" MISMATCH! Expected %d but got %d\n", clues[run_idx], count); + return 0; + } + run_idx++; + count = 0; + } + } + } + + // Check final run + if (count > 0) { + debug_printf(" Final run: count=%d, expected=%d (run_idx=%d)\n", count, clues[run_idx], run_idx); + if (clues[run_idx] != count) { + debug_printf(" FINAL MISMATCH! Expected %d but got %d\n", clues[run_idx], count); + return 0; + } + run_idx++; + } + + debug_printf(" Total runs found: %d, expected: %d\n", run_idx, num_runs); + int matches = (run_idx == num_runs); + debug_printf(" Pattern matches: %d\n", matches); + return matches; +} + +// Helper to generate random float in [0, 1] +float rand_uniform() { + return (float)rand() / (float)RAND_MAX; +} + +// Required functions +void c_reset(Nonogram* env) { + env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1)); + env->max_steps = env->easy_learn ? 
env->size * env->size : 4 * env->size * env->size; + + int full_grid_size = MAX_SIZE * MAX_SIZE; + int max_clues = MAX_SIZE / 2; + + // Initialize all grid as PADDING, then clear valid cells to EMPTY (using MAX_SIZE stride) + memset(env->observations, CELL_PADDING, full_grid_size); + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + env->observations[r * MAX_SIZE + c] = CELL_EMPTY; + } + } + // Clear clue areas + memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues); + + // Generate random solution using MAX_SIZE stride with uniform fill probability + // Sample fill probability p uniformly from [0, 1] for difficulty variation + float fill_prob = rand_uniform(); + memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE); + int has_filled = 0; + for (int i = 0; i < env->size; i++) { + for (int j = 0; j < env->size; j++) { + if (rand_uniform() < fill_prob) { + env->solution[i * MAX_SIZE + j] = CELL_BLACK; + has_filled = 1; + } + } + } + + // Ensure at least one square is set + if (!has_filled) { + int rand_row = rand() % env->size; + int rand_col = rand() % env->size; + env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK; + } + + // Reset clues arrays + memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES); + memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES); + + // Calculate row clues + for (int i = 0; i < env->size; i++) { + int clue_idx = 0; + int count = 0; + for (int j = 0; j < env->size; j++) { + if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) { + count++; + } else if (count > 0) { + env->rows_clues[i * MAX_CLUES + clue_idx] = count; + clue_idx++; + count = 0; + } + } + if (count > 0) { + env->rows_clues[i * MAX_CLUES + clue_idx] = count; + clue_idx++; + } + env->rows_num_runs[i] = clue_idx; + } + + // Calculate column clues + for (int j = 0; j < env->size; j++) { + int clue_idx = 0; + int count = 0; + for (int i = 0; i < env->size; i++) { + if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) { + count++; + } else 
if (count > 0) { + env->cols_clues[j * MAX_CLUES + clue_idx] = count; + clue_idx++; + count = 0; + } + } + if (count > 0) { + env->cols_clues[j * MAX_CLUES + clue_idx] = count; + clue_idx++; + } + env->cols_num_runs[j] = clue_idx; + } + + // Store clues in observation + memcpy(env->observations + full_grid_size, env->rows_clues, MAX_SIZE * max_clues); + memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues, env->cols_clues, MAX_SIZE * max_clues); + + // Store board size as scalar at end of observation + env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size; + + // Calculate max clues and target sums + memset(env->rows_totals, 0, MAX_SIZE); + memset(env->cols_totals, 0, MAX_SIZE); + memset(env->rows_completed, 0, MAX_SIZE); + memset(env->cols_completed, 0, MAX_SIZE); + env->filled_total = 0; + + for (int i = 0; i < env->size; i++) { + // Find max clue for row + int max_clue = 0; + int sum = 0; + for (int j = 0; j < max_clues; j++) { + int clue = env->rows_clues[i * MAX_CLUES + j]; + if (clue > max_clue) { + max_clue = clue; + } + sum += clue; + } + env->rows_max_clue[i] = max_clue; + env->rows_target_sum[i] = sum; + + // Find max clue for col + max_clue = 0; + sum = 0; + for (int j = 0; j < max_clues; j++) { + int clue = env->cols_clues[i * MAX_CLUES + j]; + if (clue > max_clue) { + max_clue = clue; + } + sum += clue; + } + env->cols_max_clue[i] = max_clue; + env->cols_target_sum[i] = sum; + } + + // Calculate target total + env->target_total = 0; + for (int i = 0; i < env->size; i++) { + env->target_total += env->rows_target_sum[i]; + } + + // Debug: print solution and clues + debug_printf("\n=== RESET: New puzzle generated (size=%d) ===\n", env->size); + debug_printf("Solution grid:\n"); + for (int r = 0; r < env->size; r++) { + debug_printf(" Row %d: ", r); + for (int c = 0; c < env->size; c++) { + debug_printf("%d ", env->solution[r * MAX_SIZE + c]); + } + debug_printf("\n"); + } + + debug_printf("\nRow clues:\n"); + for (int r = 0; 
r < env->size; r++) { + debug_printf(" Row %d (num_runs=%d, target_sum=%d, max_clue=%d): ", + r, env->rows_num_runs[r], env->rows_target_sum[r], env->rows_max_clue[r]); + for (int i = 0; i < MAX_CLUES; i++) { + int clue = env->rows_clues[r * MAX_CLUES + i]; + if (clue > 0) { + debug_printf("%d ", clue); + } + } + debug_printf("\n"); + } + + debug_printf("\nColumn clues:\n"); + for (int c = 0; c < env->size; c++) { + debug_printf(" Col %d (num_runs=%d, target_sum=%d, max_clue=%d): ", + c, env->cols_num_runs[c], env->cols_target_sum[c], env->cols_max_clue[c]); + for (int i = 0; i < MAX_CLUES; i++) { + int clue = env->cols_clues[c * MAX_CLUES + i]; + if (clue > 0) { + debug_printf("%d ", clue); + } + } + debug_printf("\n"); + } + + debug_printf("\nTarget total BLACK cells: %d\n", env->target_total); + debug_printf("===================================\n\n"); + + env->steps_taken = 0; + env->episode_reward = 0; +} + +void c_step(Nonogram* env) { + int action = env->actions[0]; + + env->terminals[0] = 0; + env->rewards[0] = 0; + + env->steps_taken++; + + debug_printf("DEBUG c_step: action=%d, steps=%d\n", action, env->steps_taken); + + // Check timeout FIRST before any game logic + if (env->steps_taken > env->max_steps) { + debug_printf("DEBUG: TIMEOUT\n"); + env->terminals[0] = 1; + env->rewards[0] = REWARD_TIMEOUT; + env->episode_reward += REWARD_TIMEOUT; + add_log(env); + c_reset(env); + return; + } + + // Decode action: 0-63 = mark WHITE, 64-127 = mark BLACK + int mark_black = action >= (MAX_SIZE * MAX_SIZE); + int pos = action % (MAX_SIZE * MAX_SIZE); + + debug_printf("DEBUG: mark_black=%d, pos=%d\n", mark_black, pos); + + // Convert position to row/col using MAX_SIZE stride + int row = pos / MAX_SIZE; + int col = pos % MAX_SIZE; + + debug_printf("DEBUG: row=%d, col=%d, size=%d\n", row, col, env->size); + + // Check if action is out of bounds (hitting padding area) + if (row >= env->size || col >= env->size) { + debug_printf("DEBUG: OUT OF BOUNDS (row=%d, col=%d >= 
size=%d)\n", row, col, env->size); + env->terminals[0] = 1; + env->rewards[0] = REWARD_OUT_OF_BOUNDS; + env->episode_reward += REWARD_OUT_OF_BOUNDS; + add_log(env); + c_reset(env); + return; + } + + unsigned char current = env->observations[pos]; + + debug_printf("DEBUG: current cell value=%d (EMPTY=%d, WHITE=%d, BLACK=%d, PADDING=%d)\n", + current, CELL_EMPTY, CELL_WHITE, CELL_BLACK, CELL_PADDING); + + // Can't mark a cell that's already been marked + if (current != CELL_EMPTY) { + debug_printf("DEBUG: INVALID - cell already marked (current=%d)\n", current); + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } + + // Mark cell as BLACK or WHITE + if (mark_black) { + debug_printf("DEBUG: Marking BLACK\n"); + // Marking BLACK - check if valid + // First check: totals equal target - invalid move (terminate episode) + debug_printf("DEBUG: rows_totals[%d]=%d, rows_target_sum[%d]=%d\n", + row, env->rows_totals[row], row, env->rows_target_sum[row]); + debug_printf("DEBUG: cols_totals[%d]=%d, cols_target_sum[%d]=%d\n", + col, env->cols_totals[col], col, env->cols_target_sum[col]); + + if (env->rows_totals[row] == env->rows_target_sum[row] || + env->cols_totals[col] == env->cols_target_sum[col]) { + debug_printf("DEBUG: INVALID - row or col already full\n"); + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } + + // Check if marking this cell BLACK would create a run longer than max allowed + int row_run = get_row_run_length(env, row, col); + debug_printf("DEBUG: row_run_length=%d, rows_max_clue[%d]=%d\n", + row_run, row, env->rows_max_clue[row]); + + if (row_run > env->rows_max_clue[row]) { + debug_printf("DEBUG: INVALID - row run too long\n"); + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + 
add_log(env); + c_reset(env); + return; + } + + int col_run = get_col_run_length(env, row, col); + debug_printf("DEBUG: col_run_length=%d, cols_max_clue[%d]=%d\n", + col_run, col, env->cols_max_clue[col]); + + if (col_run > env->cols_max_clue[col]) { + debug_printf("DEBUG: INVALID - col run too long\n"); + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } + + // Second check: if completing row/col, check runs match + int row_completed = 0; + int col_completed = 0; + + debug_printf("DEBUG: Checking line completion...\n"); + + if (env->rows_totals[row] == env->rows_target_sum[row] - 1) { + debug_printf("DEBUG: Would complete row %d, checking pattern...\n", row); + // Temporarily mark BLACK to check + env->observations[pos] = CELL_BLACK; + int row_start = row * MAX_SIZE; + int matches = check_line_matches(env->observations + row_start, + env->rows_clues + row * MAX_CLUES, + env->rows_num_runs[row], env->size); + debug_printf("DEBUG: Row pattern matches: %d\n", matches); + if (!matches) { + // Runs don't match - invalid move (terminate episode) + debug_printf("DEBUG: INVALID - row pattern doesn't match\n"); + env->observations[pos] = CELL_EMPTY; + env->terminals[0] = 1; + env->rewards[0] = REWARD_NO_MATCH; + env->episode_reward += REWARD_NO_MATCH; + add_log(env); + c_reset(env); + return; + } + env->observations[pos] = CELL_EMPTY; + row_completed = 1; + } + + if (env->cols_totals[col] == env->cols_target_sum[col] - 1) { + debug_printf("DEBUG: Would complete col %d, checking pattern...\n", col); + // Temporarily mark BLACK to check + env->observations[pos] = CELL_BLACK; + unsigned char col_data[MAX_SIZE]; + for (int i = 0; i < env->size; i++) { + col_data[i] = env->observations[i * MAX_SIZE + col]; + } + int matches = check_line_matches(col_data, + env->cols_clues + col * MAX_CLUES, + env->cols_num_runs[col], env->size); + debug_printf("DEBUG: Col pattern matches: 
%d\n", matches); + if (!matches) { + // Runs don't match - invalid move (terminate episode) + debug_printf("DEBUG: INVALID - col pattern doesn't match\n"); + env->observations[pos] = CELL_EMPTY; + env->terminals[0] = 1; + env->rewards[0] = REWARD_NO_MATCH; + env->episode_reward += REWARD_NO_MATCH; + add_log(env); + c_reset(env); + return; + } + env->observations[pos] = CELL_EMPTY; + col_completed = 1; + } + + // Apply mark BLACK + env->observations[pos] = CELL_BLACK; + env->rows_totals[row]++; + env->cols_totals[col]++; + env->filled_total++; + + // Give reward for newly completed lines only + int row_newly_completed = row_completed && !env->rows_completed[row]; + int col_newly_completed = col_completed && !env->cols_completed[col]; + + if (row_newly_completed) env->rows_completed[row] = 1; + if (col_newly_completed) env->cols_completed[col] = 1; + + float line_reward = (row_newly_completed + col_newly_completed) * REWARD_COMPLETE_LINE; + env->rewards[0] += line_reward; + env->episode_reward += line_reward; + } else { + // Marking WHITE - always valid (just marks empty as not-black) + env->observations[pos] = CELL_WHITE; + } + + // Easy learn mode: check if cell matches solution + if (env->easy_learn) { + unsigned char solution_cell = env->solution[pos]; + unsigned char actual = env->observations[pos]; + + if (solution_cell == actual) { + // Correct move: give positive reward and continue + env->rewards[0] += REWARD_EASY_LEARN_CORRECT; + env->episode_reward += REWARD_EASY_LEARN_CORRECT; + } else { + // Incorrect move: give negative reward, terminate and reset + env->rewards[0] += REWARD_EASY_LEARN_INCORRECT; + env->episode_reward += REWARD_EASY_LEARN_INCORRECT; + env->terminals[0] = 1; + add_log(env); + c_reset(env); + return; + } + } + + // Check if solved (filled_total == target_total means all BLACK cells placed correctly) + if (env->filled_total == env->target_total) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_WIN; + env->episode_reward += REWARD_WIN; 
+ add_log(env); + c_reset(env); + return; + } +} + +void c_render(Nonogram* env) { + if (!IsWindowReady()) { + int board_width = 120 + MAX_SIZE * 40; + int board_height = 120 + MAX_SIZE * 40; + int screen_width = board_width * 2 + 60 + 40; + int screen_height = board_height + 140; + InitWindow(screen_width, screen_height, "Nonogram (C)"); + SetTargetFPS(60); + } + + if (IsKeyDown(KEY_ESCAPE)) { + exit(0); + } + + BeginDrawing(); + ClearBackground((Color){0, 0, 0, 255}); + + int cell_size = 40; + int clue_area = 120; + int board_spacing = 60; + int font_size = 20; + + // Draw titles + DrawText("CURRENT BOARD", 20, 20, 24, RAYWHITE); + int solution_x = clue_area + env->size * cell_size + board_spacing + 20; + DrawText("SOLUTION", solution_x, 20, 24, RAYWHITE); + + // Draw current board + int offset_x = 20; + int offset_y = 60; + + // Draw column clues for current board + for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) { + for (int c = 0; c < env->size; c++) { + int clue = env->cols_clues[c * MAX_CLUES + clue_row]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int x = offset_x + clue_area + c * cell_size + cell_size / 2; + int y = offset_y + clue_row * 20 + 10; + int text_width = MeasureText(text, font_size); + DrawText(text, x - text_width / 2, y, font_size, RAYWHITE); + } + } + } + + // Draw row clues for current board + for (int r = 0; r < env->size; r++) { + int clue_x = offset_x + 10; + for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) { + int clue = env->rows_clues[r * MAX_CLUES + clue_idx]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int y = offset_y + clue_area + r * cell_size + cell_size / 2 - font_size / 2; + DrawText(text, clue_x, y, font_size, RAYWHITE); + clue_x += MeasureText(text, font_size) + 5; + } + } + } + + // Draw current grid + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + int x = offset_x + clue_area + c * cell_size; + int y = 
offset_y + clue_area + r * cell_size; + int pos = r * MAX_SIZE + c; + + if (env->observations[pos] == CELL_BLACK) { + DrawRectangle(x, y, cell_size, cell_size, (Color){50, 50, 50, 255}); // Dark gray for BLACK + } else if (env->observations[pos] == CELL_WHITE) { + DrawRectangle(x, y, cell_size, cell_size, (Color){240, 240, 240, 255}); // Light gray for WHITE + } else { + DrawRectangle(x, y, cell_size, cell_size, (Color){120, 120, 120, 255}); // Medium gray for EMPTY + } + DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY); + } + } + + // Draw solution board + offset_x = solution_x; + + // Draw column clues for solution + for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) { + for (int c = 0; c < env->size; c++) { + int clue = env->cols_clues[c * MAX_CLUES + clue_row]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int x = offset_x + clue_area + c * cell_size + cell_size / 2; + int y = offset_y + clue_row * 20 + 10; + int text_width = MeasureText(text, font_size); + DrawText(text, x - text_width / 2, y, font_size, RAYWHITE); + } + } + } + + // Draw row clues for solution + for (int r = 0; r < env->size; r++) { + int clue_x = offset_x + 10; + for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) { + int clue = env->rows_clues[r * MAX_CLUES + clue_idx]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int y = offset_y + clue_area + r * cell_size + cell_size / 2 - font_size / 2; + DrawText(text, clue_x, y, font_size, RAYWHITE); + clue_x += MeasureText(text, font_size) + 5; + } + } + } + + // Draw solution grid + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + int x = offset_x + clue_area + c * cell_size; + int y = offset_y + clue_area + r * cell_size; + int pos = r * MAX_SIZE + c; + + if (env->solution[pos] == CELL_BLACK) { + DrawRectangle(x, y, cell_size, cell_size, GREEN); + } else { + DrawRectangle(x, y, cell_size, cell_size, (Color){200, 200, 200, 255}); + 
} + DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY); + } + } + + // Draw status + int board_height = clue_area + env->size * cell_size; + int status_y = board_height + 80; + char status[128]; + snprintf(status, sizeof(status), "Steps: %d/%d | Filled: %d/%d | Size: %dx%d", + env->steps_taken, env->max_steps, env->filled_total, env->target_total, env->size, env->size); + DrawText(status, 20, status_y, 20, RAYWHITE); + + // Draw reward info + char reward_info[128]; + snprintf(reward_info, sizeof(reward_info), "Last Reward: %.3f | Episode Return: %.3f", + env->rewards[0], env->episode_reward); + DrawText(reward_info, 20, status_y + 25, 20, RAYWHITE); + + // Draw instructions + DrawText("Click cells to toggle | Press R to reset | ESC to quit", 20, status_y + 60, 16, LIGHTGRAY); + + EndDrawing(); +} + +void c_close(Nonogram* env) { + if (IsWindowReady()) { + CloseWindow(); + } +} diff --git a/pufferlib/ocean/nonogram/nonogram.py b/pufferlib/ocean/nonogram/nonogram.py new file mode 100644 index 000000000..c58aa1180 --- /dev/null +++ b/pufferlib/ocean/nonogram/nonogram.py @@ -0,0 +1,85 @@ +'''Nonogram logic puzzle environment''' + +import gymnasium +import numpy as np + +import pufferlib +from pufferlib.ocean.nonogram import binding + +MAX_SIZE = 8 +MIN_SIZE = 4 +MAX_CLUES = MAX_SIZE // 2 +OBS_SIZE = MAX_SIZE * MAX_SIZE + 2 * MAX_SIZE * MAX_CLUES + 1 # +1 for board size + +class Nonogram(pufferlib.PufferEnv): + def __init__(self, num_envs=1, render_mode=None, log_interval=128, + min_size=4, max_size=8, easy_learn=0, buf=None, seed=0): + # Observation space: grid cells (0-3: EMPTY/WHITE/BLACK/PADDING), clues (0-max_size), size encoding (0-1) + # Using max_size as high covers all values + self.single_observation_space = gymnasium.spaces.Box(low=0, high=max_size, + shape=(OBS_SIZE,), dtype=np.uint8) + # Action space: 0-63 = mark WHITE, 64-127 = mark BLACK + self.single_action_space = gymnasium.spaces.Discrete(MAX_SIZE * MAX_SIZE * 2) + self.render_mode = render_mode 
+ self.num_agents = num_envs + self.log_interval = log_interval + + super().__init__(buf) + self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards, + self.terminals, self.truncations, num_envs, seed, + min_size=min_size, max_size=max_size, easy_learn=easy_learn) + + self.solutions = np.zeros((num_envs, max_size * max_size), dtype=np.uint8) + + def reset(self, seed=0): + binding.vec_reset(self.c_envs, seed) + self.tick = 0 + return self.observations, [] + + def step(self, actions): + self.tick += 1 + + self.actions[:] = actions + binding.vec_step(self.c_envs) + + info = [] + if self.tick % self.log_interval == 0: + info.append(binding.vec_log(self.c_envs)) + + return (self.observations, self.rewards, + self.terminals, self.truncations, info) + + def render(self): + binding.vec_render(self.c_envs, 0) + + def close(self): + binding.vec_close(self.c_envs) + + def get_solutions(self): + """Get the solution grids for all environments""" + binding.vec_get_solutions(self.c_envs, self.solutions) + return self.solutions + + def get_size(self): + """Get current board size""" + return binding.vec_get_size(self.c_envs) + +if __name__ == '__main__': + N = 4096 + + env = Nonogram(num_envs=N, min_size=2, max_size=8) + env.reset() + steps = 0 + + CACHE = 1024 + actions = np.random.randint(0, 64, (CACHE, N)) + + i = 0 + import time + start = time.time() + while time.time() - start < 10: + env.step(actions[i % CACHE]) + steps += N + i += 1 + + print('Nonogram SPS:', int(steps / (time.time() - start))) diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index c414acde2..8da1a5935 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -942,6 +942,97 @@ def decode_actions(self, hidden): value = self.value_fn(hidden) # (B, 1) return action, value +class NonogramLSTM(pufferlib.models.LSTMWrapper): + def __init__(self, env, policy, input_size=256, hidden_size=256): + super().__init__(env, policy, input_size, hidden_size) + + +class 
Nonogram(nn.Module): + def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwargs): + super().__init__() + self.hidden_size = hidden_size + self.is_continuous = False + + # Tetris-style architecture: multi-layer CNN for grid + separate scalar encoder + + # Grid CNN (like Tetris): multiple conv layers with strides + self.conv_grid = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Conv2d(4, cnn_channels, kernel_size=3, stride=1, padding=1)), + nn.ReLU(), + pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, kernel_size=3, stride=2, padding=1)), + nn.ReLU(), + pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, kernel_size=3, stride=2, padding=1)), + nn.ReLU(), + nn.Flatten(), + pufferlib.pytorch.layer_init(nn.Linear(cnn_channels * 2 * 2, input_size)), + ) + + # Separate encoders for row clues, column clues, and size (NO weight sharing) + self.fc_row_clues = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(8 * 4 * 9, input_size // 2)), + nn.ReLU(), + ) + + self.fc_col_clues = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(8 * 4 * 9, input_size // 2)), + nn.ReLU(), + ) + + self.fc_size = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(9, input_size // 4)), + nn.ReLU(), + ) + + # Projection layer (like Tetris): combine grid and all scalar features + # input_size (grid) + input_size//2 (rows) + input_size//2 (cols) + input_size//4 (size) = 2.25 * input_size + self.proj = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(input_size + input_size // 2 + input_size // 2 + input_size // 4, hidden_size)), + nn.ReLU(), + ) + + # Output heads + self.actor = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.value_fn = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, 1), std=1) + + def forward(self, observations, state=None): + hidden = self.encode_observations(observations) + actions, value = self.decode_actions(hidden) + return actions, 
value + + def forward_train(self, x, state=None): + return self.forward(x, state) + + def encode_observations(self, observations, state=None): + B = observations.shape[0] + + # Parse observations + grid = F.one_hot(observations[:, :64].view(B, 8, 8).long(), 4).permute(0, 3, 1, 2).float() # (B, 4, 8, 8) + row_clues = F.one_hot(observations[:, 64:96].view(B, 8, 4).long(), 9).float() # (B, 8, 4, 9) + col_clues = F.one_hot(observations[:, 96:128].view(B, 8, 4).long(), 9).float() # (B, 8, 4, 9) + board_size = F.one_hot(observations[:, 128].long(), 9).float() # (B, 9) + + # Process grid through CNN (Tetris-style) + grid_feat = self.conv_grid(grid) # (B, input_size) + + # Process scalar features separately (NO weight sharing) + row_feat = self.fc_row_clues(row_clues.reshape(B, -1)) # (B, input_size//2) + col_feat = self.fc_col_clues(col_clues.reshape(B, -1)) # (B, input_size//2) + size_feat = self.fc_size(board_size) # (B, input_size//4) + + # Combine and project (Tetris-style) + combined = torch.cat([grid_feat, row_feat, col_feat, size_feat], dim=-1) + features = self.proj(combined) # (B, hidden_size) + + return features + + def decode_actions(self, flat_hidden): + action = self.actor(flat_hidden) + value = self.value_fn(flat_hidden) + return action, value + + class Drone(nn.Module): ''' Drone policy. Flattens obs and applies a linear layer. 
''' From b5da2cc023cdc4c401004ae3b5059f256dcda9aa Mon Sep 17 00:00:00 2001 From: Eitan Porat Date: Tue, 21 Oct 2025 20:50:23 +0000 Subject: [PATCH 2/5] Register env --- pufferlib/ocean/environment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index ed1408ba6..b22c5f0ee 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -126,6 +126,7 @@ def make_multiagent(buf=None, **kwargs): 'freeway': 'Freeway', 'enduro': 'Enduro', 'tetris': 'Tetris', + 'nonogram': 'Nonogram', 'cartpole': 'Cartpole', 'moba': 'Moba', 'matsci': 'Matsci', From ba94269c36cacc56829611a480af9c629f06ca6a Mon Sep 17 00:00:00 2001 From: Eitan Porat Date: Tue, 21 Oct 2025 21:13:39 +0000 Subject: [PATCH 3/5] clean up code --- pufferlib/config/ocean/nonogram.ini | 102 +-------------- pufferlib/ocean/nonogram/nonogram.h | 184 +--------------------------- pufferlib/ocean/torch.py | 30 ++--- 3 files changed, 11 insertions(+), 305 deletions(-) diff --git a/pufferlib/config/ocean/nonogram.ini b/pufferlib/config/ocean/nonogram.ini index 0c5d6ff44..7f203644a 100644 --- a/pufferlib/config/ocean/nonogram.ini +++ b/pufferlib/config/ocean/nonogram.ini @@ -2,20 +2,18 @@ package = ocean env_name = puffer_nonogram policy_name = Nonogram -; policy_name = Policy rnn_name = Recurrent [env] num_envs = 4096 min_size = 4 max_size = 8 -easy_learn = 1 +easy_learn = 0 [sweep] metric = score [train] -; Hyperparameters from wandb config name = pufferai seed = 42 gamma = 0.99965 @@ -52,101 +50,3 @@ compile_fullgraph = True max_minibatch_size = 32768 checkpoint_interval = 200 torch_deterministic = True - -; PREVIOUS RUN: Run ID 3k4cpz3ts3keprggvyldpo0bzpo2djsc -; total_timesteps = 1e10 -; minibatch_size = 65536 -; use_rnn = True -; update_epochs = 1 -; bptt_horizon = 64 -; gae_lambda = 0.9860112307817481 -; gamma = 0.9955237802885055 -; clip_coef = 0.3339182687952462 -; vf_coef = 1.3604733057894562 -; vf_clip_coef = 0.1 -; ent_coef 
= 0.01267345559258322 -; max_grad_norm = 0.7481994494317118 -; learning_rate = 0.0071601604548789605 -; adam_eps = 2.1466958248623007e-10 -; adam_beta1 = 0.9600776540257598 -; adam_beta2 = 0.9987918405974582 -; anneal_lr = True -; optimizer = muon -; prio_alpha = 0.9248668653880601 -; prio_beta0 = 0.9583638692801064 -; vtrace_c_clip = 2.931704492528996 -; vtrace_rho_clip = 1.2830763533710652 - -; PREVIOUS RUN: puffer_sweep_simple_policy -; total_timesteps = 1e10 -; minibatch_size = 16384 -; use_rnn = True -; update_epochs = 1 -; bptt_horizon = 64 -; gae_lambda = 0.9698 -; gamma = 0.9979 -; clip_coef = 0.1896 -; vf_coef = 1.4565 -; vf_clip_coef = 0.2296 -; ent_coef = 0.01257 -; max_grad_norm = 0.4804 -; learning_rate = 0.06449 -; adam_eps = 4.577e-10 -; adam_beta1 = 0.8184 -; adam_beta2 = 0.9996 -; anneal_lr = True -; optimizer = muon -; prio_alpha = 0.8445 -; prio_beta0 = 0.9498 -; vtrace_c_clip = 3.5953 -; vtrace_rho_clip = 2.2273 - -; PREVIOUS RUN: puffer_sweep_black_white_actions -; total_timesteps = 1e10 -; minibatch_size = 32768 -; use_rnn = True -; update_epochs = 1 -; bptt_horizon = 64 -; gae_lambda = 0.8645 -; gamma = 0.9991 -; clip_coef = 0.3043 -; vf_coef = 2.1905 -; vf_clip_coef = 3.1475 -; ent_coef = 0.002274 -; max_grad_norm = 1.0202 -; learning_rate = 0.007169 -; adam_eps = 6.036e-11 -; adam_beta1 = 0.9366 -; adam_beta2 = 0.9985 -; anneal_lr = True -; optimizer = muon -; prio_alpha = 0.8741 -; prio_beta0 = 0.7869 -; vtrace_c_clip = 1.6859 -; vtrace_rho_clip = 1.5254 -; total_timesteps = 10_000_000_000 -; minibatch_size = 32768 -; use_rnn = True -; update_epochs = 1 -; bptt_horizon = 64 -; gae_lambda = 0.6 -; gamma = 0.9999 -; clip_coef = 0.01 -; vf_coef = 4.453 -; vf_clip_coef = 0.1 -; ent_coef = 0.001160 -; max_grad_norm = 1.071 -; learning_rate = 0.003555 -; adam_eps = 1.675e-14 -; adam_beta1 = 0.9817 -; adam_beta2 = 0.9052 -; anneal_lr = True -; optimizer = muon -; compile = False -; precision = float32 -; torch_deterministic = True -; 
checkpoint_interval = 200 -; prio_alpha = 0.99 -; prio_beta0 = 0.855 -; vtrace_c_clip = 0.7794 -; vtrace_rho_clip = 0.8655 diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h index b4060a1de..f41f6244e 100644 --- a/pufferlib/ocean/nonogram/nonogram.h +++ b/pufferlib/ocean/nonogram/nonogram.h @@ -7,15 +7,6 @@ #include #include "raylib.h" -// Debug mode: set to 1 to enable debug output, 0 to disable -#define DEBUG 0 - -#if DEBUG -#define debug_printf(...) printf(__VA_ARGS__) -#else -#define debug_printf(...) ((void)0) -#endif - #define MAX_SIZE 8 #define MAX_CLUES (MAX_SIZE / 2) @@ -33,7 +24,6 @@ const float REWARD_EASY_LEARN_CORRECT = 0.01; const float REWARD_EASY_LEARN_INCORRECT = -0.01; const float REWARD_NO_MATCH = -0.05; -// Required struct for logging typedef struct { float score; float episode_return; @@ -42,7 +32,6 @@ typedef struct { float n; } Log; -// Nonogram environment struct typedef struct { Log log; unsigned char* observations; @@ -50,7 +39,6 @@ typedef struct { float* rewards; unsigned char* terminals; - // Environment state int size; int min_size; int max_size; @@ -60,10 +48,8 @@ typedef struct { int target_total; int easy_learn; - // Solution (for generating clues) unsigned char solution[MAX_SIZE * MAX_SIZE]; - // Clues unsigned char rows_clues[MAX_SIZE * MAX_CLUES]; unsigned char cols_clues[MAX_SIZE * MAX_CLUES]; unsigned char rows_num_runs[MAX_SIZE]; @@ -73,19 +59,15 @@ typedef struct { unsigned char rows_max_clue[MAX_SIZE]; unsigned char cols_max_clue[MAX_SIZE]; - // Current totals unsigned char rows_totals[MAX_SIZE]; unsigned char cols_totals[MAX_SIZE]; - // Track completed lines unsigned char rows_completed[MAX_SIZE]; unsigned char cols_completed[MAX_SIZE]; - // Episode reward accumulator float episode_reward; } Nonogram; -// Helper function implementations void add_log(Nonogram* env) { env->log.score += env->rewards[0]; env->log.episode_length += env->steps_taken; @@ -98,37 +80,21 @@ int 
get_row_run_length(Nonogram* env, int row, int col) { int row_start = row * MAX_SIZE; int run_length = 1; - debug_printf(" get_row_run_length: row=%d, col=%d, row_start=%d\n", row, col, row_start); - debug_printf(" Row cells before marking: "); - for (int c = 0; c < env->size; c++) { - debug_printf("%d ", env->observations[row_start + c]); - } - debug_printf("\n"); - - // Count left - int left_count = 0; for (int c = col - 1; c >= 0; c--) { if (env->observations[row_start + c] == CELL_BLACK) { run_length++; - left_count++; } else { break; } } - debug_printf(" Left count: %d\n", left_count); - // Count right - int right_count = 0; for (int c = col + 1; c < env->size; c++) { if (env->observations[row_start + c] == CELL_BLACK) { run_length++; - right_count++; } else { break; } } - debug_printf(" Right count: %d\n", right_count); - debug_printf(" Total run_length (1 + left + right): %d\n", run_length); return run_length; } @@ -136,67 +102,35 @@ int get_row_run_length(Nonogram* env, int row, int col) { int get_col_run_length(Nonogram* env, int row, int col) { int run_length = 1; - debug_printf(" get_col_run_length: row=%d, col=%d\n", row, col); - debug_printf(" Col cells before marking: "); - for (int r = 0; r < env->size; r++) { - debug_printf("%d ", env->observations[r * MAX_SIZE + col]); - } - debug_printf("\n"); - - // Count up - int up_count = 0; for (int r = row - 1; r >= 0; r--) { if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { run_length++; - up_count++; } else { break; } } - debug_printf(" Up count: %d\n", up_count); - // Count down - int down_count = 0; for (int r = row + 1; r < env->size; r++) { if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { run_length++; - down_count++; } else { break; } } - debug_printf(" Down count: %d\n", down_count); - debug_printf(" Total run_length (1 + up + down): %d\n", run_length); return run_length; } int check_line_matches(unsigned char* line_data, unsigned char* clues, int num_runs, int size) { - 
debug_printf(" check_line_matches: num_runs=%d, size=%d\n", num_runs, size); - debug_printf(" Line data: "); - for (int i = 0; i < size; i++) { - debug_printf("%d ", line_data[i]); - } - debug_printf("\n"); - debug_printf(" Expected clues: "); - for (int i = 0; i < num_runs; i++) { - debug_printf("%d ", clues[i]); - } - debug_printf("\n"); - int run_idx = 0; int count = 0; for (int i = 0; i < size; i++) { if (line_data[i] == CELL_BLACK) { count++; - debug_printf(" Position %d: BLACK, count=%d\n", i, count); } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) { if (count > 0) { - debug_printf(" End of run at position %d: count=%d, expected=%d (run_idx=%d)\n", - i, count, clues[run_idx], run_idx); if (clues[run_idx] != count) { - debug_printf(" MISMATCH! Expected %d but got %d\n", clues[run_idx], count); return 0; } run_idx++; @@ -205,28 +139,20 @@ int check_line_matches(unsigned char* line_data, unsigned char* clues, int num_r } } - // Check final run if (count > 0) { - debug_printf(" Final run: count=%d, expected=%d (run_idx=%d)\n", count, clues[run_idx], run_idx); if (clues[run_idx] != count) { - debug_printf(" FINAL MISMATCH! Expected %d but got %d\n", clues[run_idx], count); return 0; } run_idx++; } - debug_printf(" Total runs found: %d, expected: %d\n", run_idx, num_runs); - int matches = (run_idx == num_runs); - debug_printf(" Pattern matches: %d\n", matches); - return matches; + return (run_idx == num_runs); } -// Helper to generate random float in [0, 1] float rand_uniform() { return (float)rand() / (float)RAND_MAX; } -// Required functions void c_reset(Nonogram* env) { env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1)); env->max_steps = env->easy_learn ? 
env->size * env->size : 4 * env->size * env->size; @@ -234,18 +160,14 @@ void c_reset(Nonogram* env) { int full_grid_size = MAX_SIZE * MAX_SIZE; int max_clues = MAX_SIZE / 2; - // Initialize all grid as PADDING, then clear valid cells to EMPTY (using MAX_SIZE stride) memset(env->observations, CELL_PADDING, full_grid_size); for (int r = 0; r < env->size; r++) { for (int c = 0; c < env->size; c++) { env->observations[r * MAX_SIZE + c] = CELL_EMPTY; } } - // Clear clue areas memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues); - // Generate random solution using MAX_SIZE stride with uniform fill probability - // Sample fill probability p uniformly from [0, 1] for difficulty variation float fill_prob = rand_uniform(); memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE); int has_filled = 0; @@ -258,18 +180,15 @@ void c_reset(Nonogram* env) { } } - // Ensure at least one square is set if (!has_filled) { int rand_row = rand() % env->size; int rand_col = rand() % env->size; env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK; } - // Reset clues arrays memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES); memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES); - // Calculate row clues for (int i = 0; i < env->size; i++) { int clue_idx = 0; int count = 0; @@ -289,7 +208,6 @@ void c_reset(Nonogram* env) { env->rows_num_runs[i] = clue_idx; } - // Calculate column clues for (int j = 0; j < env->size; j++) { int clue_idx = 0; int count = 0; @@ -309,14 +227,11 @@ void c_reset(Nonogram* env) { env->cols_num_runs[j] = clue_idx; } - // Store clues in observation memcpy(env->observations + full_grid_size, env->rows_clues, MAX_SIZE * max_clues); memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues, env->cols_clues, MAX_SIZE * max_clues); - // Store board size as scalar at end of observation env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size; - // Calculate max clues and target sums memset(env->rows_totals, 0, MAX_SIZE); 
memset(env->cols_totals, 0, MAX_SIZE); memset(env->rows_completed, 0, MAX_SIZE); @@ -324,7 +239,6 @@ void c_reset(Nonogram* env) { env->filled_total = 0; for (int i = 0; i < env->size; i++) { - // Find max clue for row int max_clue = 0; int sum = 0; for (int j = 0; j < max_clues; j++) { @@ -337,7 +251,6 @@ void c_reset(Nonogram* env) { env->rows_max_clue[i] = max_clue; env->rows_target_sum[i] = sum; - // Find max clue for col max_clue = 0; sum = 0; for (int j = 0; j < max_clues; j++) { @@ -351,52 +264,11 @@ void c_reset(Nonogram* env) { env->cols_target_sum[i] = sum; } - // Calculate target total env->target_total = 0; for (int i = 0; i < env->size; i++) { env->target_total += env->rows_target_sum[i]; } - // Debug: print solution and clues - debug_printf("\n=== RESET: New puzzle generated (size=%d) ===\n", env->size); - debug_printf("Solution grid:\n"); - for (int r = 0; r < env->size; r++) { - debug_printf(" Row %d: ", r); - for (int c = 0; c < env->size; c++) { - debug_printf("%d ", env->solution[r * MAX_SIZE + c]); - } - debug_printf("\n"); - } - - debug_printf("\nRow clues:\n"); - for (int r = 0; r < env->size; r++) { - debug_printf(" Row %d (num_runs=%d, target_sum=%d, max_clue=%d): ", - r, env->rows_num_runs[r], env->rows_target_sum[r], env->rows_max_clue[r]); - for (int i = 0; i < MAX_CLUES; i++) { - int clue = env->rows_clues[r * MAX_CLUES + i]; - if (clue > 0) { - debug_printf("%d ", clue); - } - } - debug_printf("\n"); - } - - debug_printf("\nColumn clues:\n"); - for (int c = 0; c < env->size; c++) { - debug_printf(" Col %d (num_runs=%d, target_sum=%d, max_clue=%d): ", - c, env->cols_num_runs[c], env->cols_target_sum[c], env->cols_max_clue[c]); - for (int i = 0; i < MAX_CLUES; i++) { - int clue = env->cols_clues[c * MAX_CLUES + i]; - if (clue > 0) { - debug_printf("%d ", clue); - } - } - debug_printf("\n"); - } - - debug_printf("\nTarget total BLACK cells: %d\n", env->target_total); - debug_printf("===================================\n\n"); - 
env->steps_taken = 0; env->episode_reward = 0; } @@ -409,11 +281,7 @@ void c_step(Nonogram* env) { env->steps_taken++; - debug_printf("DEBUG c_step: action=%d, steps=%d\n", action, env->steps_taken); - - // Check timeout FIRST before any game logic if (env->steps_taken > env->max_steps) { - debug_printf("DEBUG: TIMEOUT\n"); env->terminals[0] = 1; env->rewards[0] = REWARD_TIMEOUT; env->episode_reward += REWARD_TIMEOUT; @@ -422,21 +290,13 @@ void c_step(Nonogram* env) { return; } - // Decode action: 0-63 = mark WHITE, 64-127 = mark BLACK int mark_black = action >= (MAX_SIZE * MAX_SIZE); int pos = action % (MAX_SIZE * MAX_SIZE); - debug_printf("DEBUG: mark_black=%d, pos=%d\n", mark_black, pos); - - // Convert position to row/col using MAX_SIZE stride int row = pos / MAX_SIZE; int col = pos % MAX_SIZE; - debug_printf("DEBUG: row=%d, col=%d, size=%d\n", row, col, env->size); - - // Check if action is out of bounds (hitting padding area) if (row >= env->size || col >= env->size) { - debug_printf("DEBUG: OUT OF BOUNDS (row=%d, col=%d >= size=%d)\n", row, col, env->size); env->terminals[0] = 1; env->rewards[0] = REWARD_OUT_OF_BOUNDS; env->episode_reward += REWARD_OUT_OF_BOUNDS; @@ -447,12 +307,7 @@ void c_step(Nonogram* env) { unsigned char current = env->observations[pos]; - debug_printf("DEBUG: current cell value=%d (EMPTY=%d, WHITE=%d, BLACK=%d, PADDING=%d)\n", - current, CELL_EMPTY, CELL_WHITE, CELL_BLACK, CELL_PADDING); - - // Can't mark a cell that's already been marked if (current != CELL_EMPTY) { - debug_printf("DEBUG: INVALID - cell already marked (current=%d)\n", current); env->terminals[0] = 1; env->rewards[0] = REWARD_INVALID_MOVE; env->episode_reward += REWARD_INVALID_MOVE; @@ -461,19 +316,9 @@ void c_step(Nonogram* env) { return; } - // Mark cell as BLACK or WHITE if (mark_black) { - debug_printf("DEBUG: Marking BLACK\n"); - // Marking BLACK - check if valid - // First check: totals equal target - invalid move (terminate episode) - debug_printf("DEBUG: 
rows_totals[%d]=%d, rows_target_sum[%d]=%d\n", - row, env->rows_totals[row], row, env->rows_target_sum[row]); - debug_printf("DEBUG: cols_totals[%d]=%d, cols_target_sum[%d]=%d\n", - col, env->cols_totals[col], col, env->cols_target_sum[col]); - if (env->rows_totals[row] == env->rows_target_sum[row] || env->cols_totals[col] == env->cols_target_sum[col]) { - debug_printf("DEBUG: INVALID - row or col already full\n"); env->terminals[0] = 1; env->rewards[0] = REWARD_INVALID_MOVE; env->episode_reward += REWARD_INVALID_MOVE; @@ -482,13 +327,9 @@ void c_step(Nonogram* env) { return; } - // Check if marking this cell BLACK would create a run longer than max allowed int row_run = get_row_run_length(env, row, col); - debug_printf("DEBUG: row_run_length=%d, rows_max_clue[%d]=%d\n", - row_run, row, env->rows_max_clue[row]); if (row_run > env->rows_max_clue[row]) { - debug_printf("DEBUG: INVALID - row run too long\n"); env->terminals[0] = 1; env->rewards[0] = REWARD_INVALID_MOVE; env->episode_reward += REWARD_INVALID_MOVE; @@ -498,11 +339,8 @@ void c_step(Nonogram* env) { } int col_run = get_col_run_length(env, row, col); - debug_printf("DEBUG: col_run_length=%d, cols_max_clue[%d]=%d\n", - col_run, col, env->cols_max_clue[col]); if (col_run > env->cols_max_clue[col]) { - debug_printf("DEBUG: INVALID - col run too long\n"); env->terminals[0] = 1; env->rewards[0] = REWARD_INVALID_MOVE; env->episode_reward += REWARD_INVALID_MOVE; @@ -511,24 +349,16 @@ void c_step(Nonogram* env) { return; } - // Second check: if completing row/col, check runs match int row_completed = 0; int col_completed = 0; - debug_printf("DEBUG: Checking line completion...\n"); - if (env->rows_totals[row] == env->rows_target_sum[row] - 1) { - debug_printf("DEBUG: Would complete row %d, checking pattern...\n", row); - // Temporarily mark BLACK to check env->observations[pos] = CELL_BLACK; int row_start = row * MAX_SIZE; int matches = check_line_matches(env->observations + row_start, env->rows_clues + row * 
MAX_CLUES, env->rows_num_runs[row], env->size); - debug_printf("DEBUG: Row pattern matches: %d\n", matches); if (!matches) { - // Runs don't match - invalid move (terminate episode) - debug_printf("DEBUG: INVALID - row pattern doesn't match\n"); env->observations[pos] = CELL_EMPTY; env->terminals[0] = 1; env->rewards[0] = REWARD_NO_MATCH; @@ -542,8 +372,6 @@ void c_step(Nonogram* env) { } if (env->cols_totals[col] == env->cols_target_sum[col] - 1) { - debug_printf("DEBUG: Would complete col %d, checking pattern...\n", col); - // Temporarily mark BLACK to check env->observations[pos] = CELL_BLACK; unsigned char col_data[MAX_SIZE]; for (int i = 0; i < env->size; i++) { @@ -552,10 +380,7 @@ void c_step(Nonogram* env) { int matches = check_line_matches(col_data, env->cols_clues + col * MAX_CLUES, env->cols_num_runs[col], env->size); - debug_printf("DEBUG: Col pattern matches: %d\n", matches); if (!matches) { - // Runs don't match - invalid move (terminate episode) - debug_printf("DEBUG: INVALID - col pattern doesn't match\n"); env->observations[pos] = CELL_EMPTY; env->terminals[0] = 1; env->rewards[0] = REWARD_NO_MATCH; @@ -568,13 +393,11 @@ void c_step(Nonogram* env) { col_completed = 1; } - // Apply mark BLACK env->observations[pos] = CELL_BLACK; env->rows_totals[row]++; env->cols_totals[col]++; env->filled_total++; - // Give reward for newly completed lines only int row_newly_completed = row_completed && !env->rows_completed[row]; int col_newly_completed = col_completed && !env->cols_completed[col]; @@ -585,21 +408,17 @@ void c_step(Nonogram* env) { env->rewards[0] += line_reward; env->episode_reward += line_reward; } else { - // Marking WHITE - always valid (just marks empty as not-black) env->observations[pos] = CELL_WHITE; } - // Easy learn mode: check if cell matches solution if (env->easy_learn) { unsigned char solution_cell = env->solution[pos]; unsigned char actual = env->observations[pos]; if (solution_cell == actual) { - // Correct move: give positive 
reward and continue env->rewards[0] += REWARD_EASY_LEARN_CORRECT; env->episode_reward += REWARD_EASY_LEARN_CORRECT; } else { - // Incorrect move: give negative reward, terminate and reset env->rewards[0] += REWARD_EASY_LEARN_INCORRECT; env->episode_reward += REWARD_EASY_LEARN_INCORRECT; env->terminals[0] = 1; @@ -609,7 +428,6 @@ void c_step(Nonogram* env) { } } - // Check if solved (filled_total == target_total means all BLACK cells placed correctly) if (env->filled_total == env->target_total) { env->terminals[0] = 1; env->rewards[0] = REWARD_WIN; diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index 8da1a5935..726e9c6b9 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -953,9 +953,6 @@ def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwar self.hidden_size = hidden_size self.is_continuous = False - # Tetris-style architecture: multi-layer CNN for grid + separate scalar encoder - - # Grid CNN (like Tetris): multiple conv layers with strides self.conv_grid = nn.Sequential( pufferlib.pytorch.layer_init(nn.Conv2d(4, cnn_channels, kernel_size=3, stride=1, padding=1)), nn.ReLU(), @@ -967,7 +964,6 @@ def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwar pufferlib.pytorch.layer_init(nn.Linear(cnn_channels * 2 * 2, input_size)), ) - # Separate encoders for row clues, column clues, and size (NO weight sharing) self.fc_row_clues = nn.Sequential( pufferlib.pytorch.layer_init(nn.Linear(8 * 4 * 9, input_size // 2)), nn.ReLU(), @@ -983,14 +979,11 @@ def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwar nn.ReLU(), ) - # Projection layer (like Tetris): combine grid and all scalar features - # input_size (grid) + input_size//2 (rows) + input_size//2 (cols) + input_size//4 (size) = 2.25 * input_size self.proj = nn.Sequential( pufferlib.pytorch.layer_init(nn.Linear(input_size + input_size // 2 + input_size // 2 + input_size // 4, hidden_size)), nn.ReLU(), ) - # 
Output heads self.actor = pufferlib.pytorch.layer_init( nn.Linear(hidden_size, env.single_action_space.n), std=0.01) self.value_fn = pufferlib.pytorch.layer_init( @@ -1007,23 +1000,18 @@ def forward_train(self, x, state=None): def encode_observations(self, observations, state=None): B = observations.shape[0] - # Parse observations - grid = F.one_hot(observations[:, :64].view(B, 8, 8).long(), 4).permute(0, 3, 1, 2).float() # (B, 4, 8, 8) - row_clues = F.one_hot(observations[:, 64:96].view(B, 8, 4).long(), 9).float() # (B, 8, 4, 9) - col_clues = F.one_hot(observations[:, 96:128].view(B, 8, 4).long(), 9).float() # (B, 8, 4, 9) - board_size = F.one_hot(observations[:, 128].long(), 9).float() # (B, 9) - - # Process grid through CNN (Tetris-style) - grid_feat = self.conv_grid(grid) # (B, input_size) + grid = F.one_hot(observations[:, :64].view(B, 8, 8).long(), 4).permute(0, 3, 1, 2).float() + row_clues = F.one_hot(observations[:, 64:96].view(B, 8, 4).long(), 9).float() + col_clues = F.one_hot(observations[:, 96:128].view(B, 8, 4).long(), 9).float() + board_size = F.one_hot(observations[:, 128].long(), 9).float() - # Process scalar features separately (NO weight sharing) - row_feat = self.fc_row_clues(row_clues.reshape(B, -1)) # (B, input_size//2) - col_feat = self.fc_col_clues(col_clues.reshape(B, -1)) # (B, input_size//2) - size_feat = self.fc_size(board_size) # (B, input_size//4) + grid_feat = self.conv_grid(grid) + row_feat = self.fc_row_clues(row_clues.reshape(B, -1)) + col_feat = self.fc_col_clues(col_clues.reshape(B, -1)) + size_feat = self.fc_size(board_size) - # Combine and project (Tetris-style) combined = torch.cat([grid_feat, row_feat, col_feat, size_feat], dim=-1) - features = self.proj(combined) # (B, hidden_size) + features = self.proj(combined) return features From 565ea9f4d0758c5b5b599a20d1fa91eb5d65b3d9 Mon Sep 17 00:00:00 2001 From: Eitan Porat Date: Tue, 21 Oct 2025 21:22:21 +0000 Subject: [PATCH 4/5] format code --- pufferlib/ocean/nonogram/binding.c 
| 107 +-- pufferlib/ocean/nonogram/nonogram.c | 38 +- pufferlib/ocean/nonogram/nonogram.h | 989 ++++++++++++++-------------- 3 files changed, 574 insertions(+), 560 deletions(-) diff --git a/pufferlib/ocean/nonogram/binding.c b/pufferlib/ocean/nonogram/binding.c index 8f63c9d9b..910f5e90f 100644 --- a/pufferlib/ocean/nonogram/binding.c +++ b/pufferlib/ocean/nonogram/binding.c @@ -1,75 +1,76 @@ -#include #include "nonogram.h" +#include // Forward declare custom methods -static PyObject* vec_get_solutions(PyObject* self, PyObject* args); -static PyObject* vec_get_size(PyObject* self, PyObject* args); +static PyObject *vec_get_solutions(PyObject *self, PyObject *args); +static PyObject *vec_get_size(PyObject *self, PyObject *args); #define Env Nonogram -#define MY_METHODS \ - {"vec_get_solutions", vec_get_solutions, METH_VARARGS, "Get solutions from all environments"}, \ - {"vec_get_size", vec_get_size, METH_VARARGS, "Get current board size"} +#define MY_METHODS \ + {"vec_get_solutions", vec_get_solutions, METH_VARARGS, \ + "Get solutions from all environments"}, \ + {"vec_get_size", vec_get_size, METH_VARARGS, "Get current board size"} #include "../env_binding.h" -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->min_size = unpack(kwargs, "min_size"); - env->max_size = unpack(kwargs, "max_size"); - env->easy_learn = unpack(kwargs, "easy_learn"); - env->size = env->max_size; - env->max_steps = 4 * env->max_size * env->max_size; - return 0; +static int my_init(Env *env, PyObject *args, PyObject *kwargs) { + env->min_size = unpack(kwargs, "min_size"); + env->max_size = unpack(kwargs, "max_size"); + env->easy_learn = unpack(kwargs, "easy_learn"); + env->size = env->max_size; + env->max_steps = 4 * env->max_size * env->max_size; + return 0; } -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", 
log->episode_length); - assign_to_dict(dict, "solved", log->solved); - return 0; +static int my_log(PyObject *dict, Log *log) { + assign_to_dict(dict, "score", log->score); + assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "episode_length", log->episode_length); + assign_to_dict(dict, "solved", log->solved); + return 0; } // Custom method to get solutions from all environments -static PyObject* vec_get_solutions(PyObject* self, PyObject* args) { - if (PyTuple_Size(args) != 2) { - PyErr_SetString(PyExc_TypeError, "vec_get_solutions requires 2 arguments"); - return NULL; - } +static PyObject *vec_get_solutions(PyObject *self, PyObject *args) { + if (PyTuple_Size(args) != 2) { + PyErr_SetString(PyExc_TypeError, "vec_get_solutions requires 2 arguments"); + return NULL; + } - VecEnv* vec = unpack_vecenv(args); - if (!vec) { - return NULL; - } + VecEnv *vec = unpack_vecenv(args); + if (!vec) { + return NULL; + } - PyObject* solutions_obj = PyTuple_GetItem(args, 1); - if (!PyObject_TypeCheck(solutions_obj, &PyArray_Type)) { - PyErr_SetString(PyExc_TypeError, "solutions must be a NumPy array"); - return NULL; - } - PyArrayObject* solutions = (PyArrayObject*)solutions_obj; - if (!PyArray_ISCONTIGUOUS(solutions)) { - PyErr_SetString(PyExc_ValueError, "solutions must be contiguous"); - return NULL; - } + PyObject *solutions_obj = PyTuple_GetItem(args, 1); + if (!PyObject_TypeCheck(solutions_obj, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "solutions must be a NumPy array"); + return NULL; + } + PyArrayObject *solutions = (PyArrayObject *)solutions_obj; + if (!PyArray_ISCONTIGUOUS(solutions)) { + PyErr_SetString(PyExc_ValueError, "solutions must be contiguous"); + return NULL; + } - // Copy solutions from each environment (always use max_size for buffer) - unsigned char* sol_ptr = PyArray_DATA(solutions); - int max_grid_size = MAX_SIZE * MAX_SIZE; - for (int i = 0; i < vec->num_envs; i++) { - Nonogram* env = vec->envs[i]; - 
memcpy(sol_ptr + i * max_grid_size, env->solution, max_grid_size); - } + // Copy solutions from each environment (always use max_size for buffer) + unsigned char *sol_ptr = PyArray_DATA(solutions); + int max_grid_size = MAX_SIZE * MAX_SIZE; + for (int i = 0; i < vec->num_envs; i++) { + Nonogram *env = vec->envs[i]; + memcpy(sol_ptr + i * max_grid_size, env->solution, max_grid_size); + } - Py_RETURN_NONE; + Py_RETURN_NONE; } // Get current board size from first environment -static PyObject* vec_get_size(PyObject* self, PyObject* args) { - VecEnv* vec = unpack_vecenv(args); - if (!vec) { - return NULL; - } +static PyObject *vec_get_size(PyObject *self, PyObject *args) { + VecEnv *vec = unpack_vecenv(args); + if (!vec) { + return NULL; + } - Nonogram* env = vec->envs[0]; - return PyLong_FromLong(env->size); + Nonogram *env = vec->envs[0]; + return PyLong_FromLong(env->size); } diff --git a/pufferlib/ocean/nonogram/nonogram.c b/pufferlib/ocean/nonogram/nonogram.c index 953be3dd1..91e07bbd4 100644 --- a/pufferlib/ocean/nonogram/nonogram.c +++ b/pufferlib/ocean/nonogram/nonogram.c @@ -6,27 +6,27 @@ #include "nonogram.h" int main() { - Nonogram env = {.size = 8}; - int max_clues = env.size / 2; - int obs_size = env.size * env.size + 2 * env.size * max_clues; + Nonogram env = {.size = 8}; + int max_clues = env.size / 2; + int obs_size = env.size * env.size + 2 * env.size * max_clues; - env.max_steps = 4 * env.size * env.size; - env.observations = (unsigned char*)calloc(obs_size, sizeof(unsigned char)); - env.actions = (int*)calloc(1, sizeof(int)); - env.rewards = (float*)calloc(1, sizeof(float)); - env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char)); + env.max_steps = 4 * env.size * env.size; + env.observations = (unsigned char *)calloc(obs_size, sizeof(unsigned char)); + env.actions = (int *)calloc(1, sizeof(int)); + env.rewards = (float *)calloc(1, sizeof(float)); + env.terminals = (unsigned char *)calloc(1, sizeof(unsigned char)); - c_reset(&env); + 
c_reset(&env); + c_render(&env); + while (!WindowShouldClose()) { + env.actions[0] = rand() % (env.size * env.size); + c_step(&env); c_render(&env); - while (!WindowShouldClose()) { - env.actions[0] = rand() % (env.size * env.size); - c_step(&env); - c_render(&env); - } + } - free(env.observations); - free(env.actions); - free(env.rewards); - free(env.terminals); - c_close(&env); + free(env.observations); + free(env.actions); + free(env.rewards); + free(env.terminals); + c_close(&env); } diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h index f41f6244e..7ef7d9ed9 100644 --- a/pufferlib/ocean/nonogram/nonogram.h +++ b/pufferlib/ocean/nonogram/nonogram.h @@ -2,10 +2,10 @@ * Players fill cells based on row and column clues (run-length encoding) */ +#include "raylib.h" #include #include #include -#include "raylib.h" #define MAX_SIZE 8 #define MAX_CLUES (MAX_SIZE / 2) @@ -25,569 +25,582 @@ const float REWARD_EASY_LEARN_INCORRECT = -0.01; const float REWARD_NO_MATCH = -0.05; typedef struct { - float score; - float episode_return; - float episode_length; - float solved; - float n; + float score; + float episode_return; + float episode_length; + float solved; + float n; } Log; typedef struct { - Log log; - unsigned char* observations; - int* actions; - float* rewards; - unsigned char* terminals; - - int size; - int min_size; - int max_size; - int max_steps; - int steps_taken; - int filled_total; - int target_total; - int easy_learn; - - unsigned char solution[MAX_SIZE * MAX_SIZE]; - - unsigned char rows_clues[MAX_SIZE * MAX_CLUES]; - unsigned char cols_clues[MAX_SIZE * MAX_CLUES]; - unsigned char rows_num_runs[MAX_SIZE]; - unsigned char cols_num_runs[MAX_SIZE]; - unsigned char rows_target_sum[MAX_SIZE]; - unsigned char cols_target_sum[MAX_SIZE]; - unsigned char rows_max_clue[MAX_SIZE]; - unsigned char cols_max_clue[MAX_SIZE]; - - unsigned char rows_totals[MAX_SIZE]; - unsigned char cols_totals[MAX_SIZE]; - - unsigned char 
rows_completed[MAX_SIZE]; - unsigned char cols_completed[MAX_SIZE]; - - float episode_reward; + Log log; + unsigned char *observations; + int *actions; + float *rewards; + unsigned char *terminals; + + int size; + int min_size; + int max_size; + int max_steps; + int steps_taken; + int filled_total; + int target_total; + int easy_learn; + + unsigned char solution[MAX_SIZE * MAX_SIZE]; + + unsigned char rows_clues[MAX_SIZE * MAX_CLUES]; + unsigned char cols_clues[MAX_SIZE * MAX_CLUES]; + unsigned char rows_num_runs[MAX_SIZE]; + unsigned char cols_num_runs[MAX_SIZE]; + unsigned char rows_target_sum[MAX_SIZE]; + unsigned char cols_target_sum[MAX_SIZE]; + unsigned char rows_max_clue[MAX_SIZE]; + unsigned char cols_max_clue[MAX_SIZE]; + + unsigned char rows_totals[MAX_SIZE]; + unsigned char cols_totals[MAX_SIZE]; + + unsigned char rows_completed[MAX_SIZE]; + unsigned char cols_completed[MAX_SIZE]; + + float episode_reward; } Nonogram; -void add_log(Nonogram* env) { - env->log.score += env->rewards[0]; - env->log.episode_length += env->steps_taken; - env->log.episode_return += env->episode_reward; - env->log.solved += (env->rewards[0] > 0) ? 1 : 0; - env->log.n++; +void add_log(Nonogram *env) { + env->log.score += env->rewards[0]; + env->log.episode_length += env->steps_taken; + env->log.episode_return += env->episode_reward; + env->log.solved += (env->rewards[0] > 0) ? 
1 : 0; + env->log.n++; } -int get_row_run_length(Nonogram* env, int row, int col) { - int row_start = row * MAX_SIZE; - int run_length = 1; +int get_row_run_length(Nonogram *env, int row, int col) { + int row_start = row * MAX_SIZE; + int run_length = 1; - for (int c = col - 1; c >= 0; c--) { - if (env->observations[row_start + c] == CELL_BLACK) { - run_length++; - } else { - break; - } + for (int c = col - 1; c >= 0; c--) { + if (env->observations[row_start + c] == CELL_BLACK) { + run_length++; + } else { + break; } + } - for (int c = col + 1; c < env->size; c++) { - if (env->observations[row_start + c] == CELL_BLACK) { - run_length++; - } else { - break; - } + for (int c = col + 1; c < env->size; c++) { + if (env->observations[row_start + c] == CELL_BLACK) { + run_length++; + } else { + break; } + } - return run_length; + return run_length; } -int get_col_run_length(Nonogram* env, int row, int col) { - int run_length = 1; +int get_col_run_length(Nonogram *env, int row, int col) { + int run_length = 1; - for (int r = row - 1; r >= 0; r--) { - if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { - run_length++; - } else { - break; - } + for (int r = row - 1; r >= 0; r--) { + if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { + run_length++; + } else { + break; } + } - for (int r = row + 1; r < env->size; r++) { - if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { - run_length++; - } else { - break; - } + for (int r = row + 1; r < env->size; r++) { + if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { + run_length++; + } else { + break; } + } - return run_length; + return run_length; } -int check_line_matches(unsigned char* line_data, unsigned char* clues, int num_runs, int size) { - int run_idx = 0; - int count = 0; - - for (int i = 0; i < size; i++) { - if (line_data[i] == CELL_BLACK) { - count++; - } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) { - if (count > 0) { - if (clues[run_idx] != count) { - return 0; - } 
- run_idx++; - count = 0; - } - } - } +int check_line_matches(unsigned char *line_data, unsigned char *clues, + int num_runs, int size) { + int run_idx = 0; + int count = 0; - if (count > 0) { + for (int i = 0; i < size; i++) { + if (line_data[i] == CELL_BLACK) { + count++; + } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) { + if (count > 0) { if (clues[run_idx] != count) { - return 0; + return 0; } run_idx++; + count = 0; + } } + } - return (run_idx == num_runs); -} + if (count > 0) { + if (clues[run_idx] != count) { + return 0; + } + run_idx++; + } -float rand_uniform() { - return (float)rand() / (float)RAND_MAX; + return (run_idx == num_runs); } -void c_reset(Nonogram* env) { - env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1)); - env->max_steps = env->easy_learn ? env->size * env->size : 4 * env->size * env->size; +float rand_uniform() { return (float)rand() / (float)RAND_MAX; } - int full_grid_size = MAX_SIZE * MAX_SIZE; - int max_clues = MAX_SIZE / 2; +void c_reset(Nonogram *env) { + env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1)); + env->max_steps = + env->easy_learn ? 
env->size * env->size : 4 * env->size * env->size; - memset(env->observations, CELL_PADDING, full_grid_size); - for (int r = 0; r < env->size; r++) { - for (int c = 0; c < env->size; c++) { - env->observations[r * MAX_SIZE + c] = CELL_EMPTY; - } - } - memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues); + int full_grid_size = MAX_SIZE * MAX_SIZE; + int max_clues = MAX_SIZE / 2; - float fill_prob = rand_uniform(); - memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE); - int has_filled = 0; - for (int i = 0; i < env->size; i++) { - for (int j = 0; j < env->size; j++) { - if (rand_uniform() < fill_prob) { - env->solution[i * MAX_SIZE + j] = CELL_BLACK; - has_filled = 1; - } - } + memset(env->observations, CELL_PADDING, full_grid_size); + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + env->observations[r * MAX_SIZE + c] = CELL_EMPTY; } + } + memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues); - if (!has_filled) { - int rand_row = rand() % env->size; - int rand_col = rand() % env->size; - env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK; + float fill_prob = rand_uniform(); + memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE); + int has_filled = 0; + for (int i = 0; i < env->size; i++) { + for (int j = 0; j < env->size; j++) { + if (rand_uniform() < fill_prob) { + env->solution[i * MAX_SIZE + j] = CELL_BLACK; + has_filled = 1; + } } + } - memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES); - memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES); + if (!has_filled) { + int rand_row = rand() % env->size; + int rand_col = rand() % env->size; + env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK; + } - for (int i = 0; i < env->size; i++) { - int clue_idx = 0; - int count = 0; - for (int j = 0; j < env->size; j++) { - if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) { - count++; - } else if (count > 0) { - env->rows_clues[i * MAX_CLUES + clue_idx] = count; - clue_idx++; - count = 0; - 
} - } - if (count > 0) { - env->rows_clues[i * MAX_CLUES + clue_idx] = count; - clue_idx++; - } - env->rows_num_runs[i] = clue_idx; - } + memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES); + memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES); + for (int i = 0; i < env->size; i++) { + int clue_idx = 0; + int count = 0; for (int j = 0; j < env->size; j++) { - int clue_idx = 0; - int count = 0; - for (int i = 0; i < env->size; i++) { - if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) { - count++; - } else if (count > 0) { - env->cols_clues[j * MAX_CLUES + clue_idx] = count; - clue_idx++; - count = 0; - } - } - if (count > 0) { - env->cols_clues[j * MAX_CLUES + clue_idx] = count; - clue_idx++; - } - env->cols_num_runs[j] = clue_idx; + if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) { + count++; + } else if (count > 0) { + env->rows_clues[i * MAX_CLUES + clue_idx] = count; + clue_idx++; + count = 0; + } } - - memcpy(env->observations + full_grid_size, env->rows_clues, MAX_SIZE * max_clues); - memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues, env->cols_clues, MAX_SIZE * max_clues); - - env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size; - - memset(env->rows_totals, 0, MAX_SIZE); - memset(env->cols_totals, 0, MAX_SIZE); - memset(env->rows_completed, 0, MAX_SIZE); - memset(env->cols_completed, 0, MAX_SIZE); - env->filled_total = 0; - - for (int i = 0; i < env->size; i++) { - int max_clue = 0; - int sum = 0; - for (int j = 0; j < max_clues; j++) { - int clue = env->rows_clues[i * MAX_CLUES + j]; - if (clue > max_clue) { - max_clue = clue; - } - sum += clue; - } - env->rows_max_clue[i] = max_clue; - env->rows_target_sum[i] = sum; - - max_clue = 0; - sum = 0; - for (int j = 0; j < max_clues; j++) { - int clue = env->cols_clues[i * MAX_CLUES + j]; - if (clue > max_clue) { - max_clue = clue; - } - sum += clue; - } - env->cols_max_clue[i] = max_clue; - env->cols_target_sum[i] = sum; + if (count > 0) { + env->rows_clues[i * MAX_CLUES + 
clue_idx] = count; + clue_idx++; } + env->rows_num_runs[i] = clue_idx; + } - env->target_total = 0; + for (int j = 0; j < env->size; j++) { + int clue_idx = 0; + int count = 0; for (int i = 0; i < env->size; i++) { - env->target_total += env->rows_target_sum[i]; + if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) { + count++; + } else if (count > 0) { + env->cols_clues[j * MAX_CLUES + clue_idx] = count; + clue_idx++; + count = 0; + } + } + if (count > 0) { + env->cols_clues[j * MAX_CLUES + clue_idx] = count; + clue_idx++; + } + env->cols_num_runs[j] = clue_idx; + } + + memcpy(env->observations + full_grid_size, env->rows_clues, + MAX_SIZE * max_clues); + memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues, + env->cols_clues, MAX_SIZE * max_clues); + + env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size; + + memset(env->rows_totals, 0, MAX_SIZE); + memset(env->cols_totals, 0, MAX_SIZE); + memset(env->rows_completed, 0, MAX_SIZE); + memset(env->cols_completed, 0, MAX_SIZE); + env->filled_total = 0; + + for (int i = 0; i < env->size; i++) { + int max_clue = 0; + int sum = 0; + for (int j = 0; j < max_clues; j++) { + int clue = env->rows_clues[i * MAX_CLUES + j]; + if (clue > max_clue) { + max_clue = clue; + } + sum += clue; + } + env->rows_max_clue[i] = max_clue; + env->rows_target_sum[i] = sum; + + max_clue = 0; + sum = 0; + for (int j = 0; j < max_clues; j++) { + int clue = env->cols_clues[i * MAX_CLUES + j]; + if (clue > max_clue) { + max_clue = clue; + } + sum += clue; } + env->cols_max_clue[i] = max_clue; + env->cols_target_sum[i] = sum; + } - env->steps_taken = 0; - env->episode_reward = 0; -} + env->target_total = 0; + for (int i = 0; i < env->size; i++) { + env->target_total += env->rows_target_sum[i]; + } -void c_step(Nonogram* env) { - int action = env->actions[0]; + env->steps_taken = 0; + env->episode_reward = 0; +} - env->terminals[0] = 0; - env->rewards[0] = 0; +void c_step(Nonogram *env) { + int action = 
env->actions[0]; + + env->terminals[0] = 0; + env->rewards[0] = 0; + + env->steps_taken++; + + if (env->steps_taken > env->max_steps) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_TIMEOUT; + env->episode_reward += REWARD_TIMEOUT; + add_log(env); + c_reset(env); + return; + } + + int mark_black = action >= (MAX_SIZE * MAX_SIZE); + int pos = action % (MAX_SIZE * MAX_SIZE); + + int row = pos / MAX_SIZE; + int col = pos % MAX_SIZE; + + if (row >= env->size || col >= env->size) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_OUT_OF_BOUNDS; + env->episode_reward += REWARD_OUT_OF_BOUNDS; + add_log(env); + c_reset(env); + return; + } + + unsigned char current = env->observations[pos]; + + if (current != CELL_EMPTY) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } + + if (mark_black) { + if (env->rows_totals[row] == env->rows_target_sum[row] || + env->cols_totals[col] == env->cols_target_sum[col]) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } - env->steps_taken++; + int row_run = get_row_run_length(env, row, col); - if (env->steps_taken > env->max_steps) { - env->terminals[0] = 1; - env->rewards[0] = REWARD_TIMEOUT; - env->episode_reward += REWARD_TIMEOUT; - add_log(env); - c_reset(env); - return; + if (row_run > env->rows_max_clue[row]) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; } - int mark_black = action >= (MAX_SIZE * MAX_SIZE); - int pos = action % (MAX_SIZE * MAX_SIZE); + int col_run = get_col_run_length(env, row, col); - int row = pos / MAX_SIZE; - int col = pos % MAX_SIZE; + if (col_run > env->cols_max_clue[col]) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += 
REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } - if (row >= env->size || col >= env->size) { + int row_completed = 0; + int col_completed = 0; + + if (env->rows_totals[row] == env->rows_target_sum[row] - 1) { + env->observations[pos] = CELL_BLACK; + int row_start = row * MAX_SIZE; + int matches = check_line_matches(env->observations + row_start, + env->rows_clues + row * MAX_CLUES, + env->rows_num_runs[row], env->size); + if (!matches) { + env->observations[pos] = CELL_EMPTY; env->terminals[0] = 1; - env->rewards[0] = REWARD_OUT_OF_BOUNDS; - env->episode_reward += REWARD_OUT_OF_BOUNDS; + env->rewards[0] = REWARD_NO_MATCH; + env->episode_reward += REWARD_NO_MATCH; add_log(env); c_reset(env); return; + } + env->observations[pos] = CELL_EMPTY; + row_completed = 1; } - unsigned char current = env->observations[pos]; - - if (current != CELL_EMPTY) { + if (env->cols_totals[col] == env->cols_target_sum[col] - 1) { + env->observations[pos] = CELL_BLACK; + unsigned char col_data[MAX_SIZE]; + for (int i = 0; i < env->size; i++) { + col_data[i] = env->observations[i * MAX_SIZE + col]; + } + int matches = + check_line_matches(col_data, env->cols_clues + col * MAX_CLUES, + env->cols_num_runs[col], env->size); + if (!matches) { + env->observations[pos] = CELL_EMPTY; env->terminals[0] = 1; - env->rewards[0] = REWARD_INVALID_MOVE; - env->episode_reward += REWARD_INVALID_MOVE; + env->rewards[0] = REWARD_NO_MATCH; + env->episode_reward += REWARD_NO_MATCH; add_log(env); c_reset(env); return; + } + env->observations[pos] = CELL_EMPTY; + col_completed = 1; } - if (mark_black) { - if (env->rows_totals[row] == env->rows_target_sum[row] || - env->cols_totals[col] == env->cols_target_sum[col]) { - env->terminals[0] = 1; - env->rewards[0] = REWARD_INVALID_MOVE; - env->episode_reward += REWARD_INVALID_MOVE; - add_log(env); - c_reset(env); - return; - } - - int row_run = get_row_run_length(env, row, col); - - if (row_run > env->rows_max_clue[row]) { - env->terminals[0] = 
1; - env->rewards[0] = REWARD_INVALID_MOVE; - env->episode_reward += REWARD_INVALID_MOVE; - add_log(env); - c_reset(env); - return; - } - - int col_run = get_col_run_length(env, row, col); - - if (col_run > env->cols_max_clue[col]) { - env->terminals[0] = 1; - env->rewards[0] = REWARD_INVALID_MOVE; - env->episode_reward += REWARD_INVALID_MOVE; - add_log(env); - c_reset(env); - return; - } - - int row_completed = 0; - int col_completed = 0; - - if (env->rows_totals[row] == env->rows_target_sum[row] - 1) { - env->observations[pos] = CELL_BLACK; - int row_start = row * MAX_SIZE; - int matches = check_line_matches(env->observations + row_start, - env->rows_clues + row * MAX_CLUES, - env->rows_num_runs[row], env->size); - if (!matches) { - env->observations[pos] = CELL_EMPTY; - env->terminals[0] = 1; - env->rewards[0] = REWARD_NO_MATCH; - env->episode_reward += REWARD_NO_MATCH; - add_log(env); - c_reset(env); - return; - } - env->observations[pos] = CELL_EMPTY; - row_completed = 1; - } - - if (env->cols_totals[col] == env->cols_target_sum[col] - 1) { - env->observations[pos] = CELL_BLACK; - unsigned char col_data[MAX_SIZE]; - for (int i = 0; i < env->size; i++) { - col_data[i] = env->observations[i * MAX_SIZE + col]; - } - int matches = check_line_matches(col_data, - env->cols_clues + col * MAX_CLUES, - env->cols_num_runs[col], env->size); - if (!matches) { - env->observations[pos] = CELL_EMPTY; - env->terminals[0] = 1; - env->rewards[0] = REWARD_NO_MATCH; - env->episode_reward += REWARD_NO_MATCH; - add_log(env); - c_reset(env); - return; - } - env->observations[pos] = CELL_EMPTY; - col_completed = 1; - } - - env->observations[pos] = CELL_BLACK; - env->rows_totals[row]++; - env->cols_totals[col]++; - env->filled_total++; - - int row_newly_completed = row_completed && !env->rows_completed[row]; - int col_newly_completed = col_completed && !env->cols_completed[col]; - - if (row_newly_completed) env->rows_completed[row] = 1; - if (col_newly_completed) 
env->cols_completed[col] = 1; - - float line_reward = (row_newly_completed + col_newly_completed) * REWARD_COMPLETE_LINE; - env->rewards[0] += line_reward; - env->episode_reward += line_reward; + env->observations[pos] = CELL_BLACK; + env->rows_totals[row]++; + env->cols_totals[col]++; + env->filled_total++; + + int row_newly_completed = row_completed && !env->rows_completed[row]; + int col_newly_completed = col_completed && !env->cols_completed[col]; + + if (row_newly_completed) + env->rows_completed[row] = 1; + if (col_newly_completed) + env->cols_completed[col] = 1; + + float line_reward = + (row_newly_completed + col_newly_completed) * REWARD_COMPLETE_LINE; + env->rewards[0] += line_reward; + env->episode_reward += line_reward; + } else { + env->observations[pos] = CELL_WHITE; + } + + if (env->easy_learn) { + unsigned char solution_cell = env->solution[pos]; + unsigned char actual = env->observations[pos]; + + if (solution_cell == actual) { + env->rewards[0] += REWARD_EASY_LEARN_CORRECT; + env->episode_reward += REWARD_EASY_LEARN_CORRECT; } else { - env->observations[pos] = CELL_WHITE; - } - - if (env->easy_learn) { - unsigned char solution_cell = env->solution[pos]; - unsigned char actual = env->observations[pos]; - - if (solution_cell == actual) { - env->rewards[0] += REWARD_EASY_LEARN_CORRECT; - env->episode_reward += REWARD_EASY_LEARN_CORRECT; - } else { - env->rewards[0] += REWARD_EASY_LEARN_INCORRECT; - env->episode_reward += REWARD_EASY_LEARN_INCORRECT; - env->terminals[0] = 1; - add_log(env); - c_reset(env); - return; - } - } - - if (env->filled_total == env->target_total) { - env->terminals[0] = 1; - env->rewards[0] = REWARD_WIN; - env->episode_reward += REWARD_WIN; - add_log(env); - c_reset(env); - return; + env->rewards[0] += REWARD_EASY_LEARN_INCORRECT; + env->episode_reward += REWARD_EASY_LEARN_INCORRECT; + env->terminals[0] = 1; + add_log(env); + c_reset(env); + return; } + } + + if (env->filled_total == env->target_total) { + env->terminals[0] = 
1; + env->rewards[0] = REWARD_WIN; + env->episode_reward += REWARD_WIN; + add_log(env); + c_reset(env); + return; + } } -void c_render(Nonogram* env) { - if (!IsWindowReady()) { - int board_width = 120 + MAX_SIZE * 40; - int board_height = 120 + MAX_SIZE * 40; - int screen_width = board_width * 2 + 60 + 40; - int screen_height = board_height + 140; - InitWindow(screen_width, screen_height, "Nonogram (C)"); - SetTargetFPS(60); +void c_render(Nonogram *env) { + if (!IsWindowReady()) { + int board_width = 120 + MAX_SIZE * 40; + int board_height = 120 + MAX_SIZE * 40; + int screen_width = board_width * 2 + 60 + 40; + int screen_height = board_height + 140; + InitWindow(screen_width, screen_height, "Nonogram (C)"); + SetTargetFPS(60); + } + + if (IsKeyDown(KEY_ESCAPE)) { + exit(0); + } + + BeginDrawing(); + ClearBackground((Color){0, 0, 0, 255}); + + int cell_size = 40; + int clue_area = 120; + int board_spacing = 60; + int font_size = 20; + + // Draw titles + DrawText("CURRENT BOARD", 20, 20, 24, RAYWHITE); + int solution_x = clue_area + env->size * cell_size + board_spacing + 20; + DrawText("SOLUTION", solution_x, 20, 24, RAYWHITE); + + // Draw current board + int offset_x = 20; + int offset_y = 60; + + // Draw column clues for current board + for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) { + for (int c = 0; c < env->size; c++) { + int clue = env->cols_clues[c * MAX_CLUES + clue_row]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int x = offset_x + clue_area + c * cell_size + cell_size / 2; + int y = offset_y + clue_row * 20 + 10; + int text_width = MeasureText(text, font_size); + DrawText(text, x - text_width / 2, y, font_size, RAYWHITE); + } } - - if (IsKeyDown(KEY_ESCAPE)) { - exit(0); + } + + // Draw row clues for current board + for (int r = 0; r < env->size; r++) { + int clue_x = offset_x + 10; + for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) { + int clue = env->rows_clues[r * MAX_CLUES + clue_idx]; + if 
(clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int y = offset_y + clue_area + r * cell_size + cell_size / 2 - + font_size / 2; + DrawText(text, clue_x, y, font_size, RAYWHITE); + clue_x += MeasureText(text, font_size) + 5; + } } - - BeginDrawing(); - ClearBackground((Color){0, 0, 0, 255}); - - int cell_size = 40; - int clue_area = 120; - int board_spacing = 60; - int font_size = 20; - - // Draw titles - DrawText("CURRENT BOARD", 20, 20, 24, RAYWHITE); - int solution_x = clue_area + env->size * cell_size + board_spacing + 20; - DrawText("SOLUTION", solution_x, 20, 24, RAYWHITE); - - // Draw current board - int offset_x = 20; - int offset_y = 60; - - // Draw column clues for current board - for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) { - for (int c = 0; c < env->size; c++) { - int clue = env->cols_clues[c * MAX_CLUES + clue_row]; - if (clue > 0) { - char text[4]; - snprintf(text, sizeof(text), "%d", clue); - int x = offset_x + clue_area + c * cell_size + cell_size / 2; - int y = offset_y + clue_row * 20 + 10; - int text_width = MeasureText(text, font_size); - DrawText(text, x - text_width / 2, y, font_size, RAYWHITE); - } - } + } + + // Draw current grid + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + int x = offset_x + clue_area + c * cell_size; + int y = offset_y + clue_area + r * cell_size; + int pos = r * MAX_SIZE + c; + + if (env->observations[pos] == CELL_BLACK) { + DrawRectangle(x, y, cell_size, cell_size, + (Color){50, 50, 50, 255}); // Dark gray for BLACK + } else if (env->observations[pos] == CELL_WHITE) { + DrawRectangle(x, y, cell_size, cell_size, + (Color){240, 240, 240, 255}); // Light gray for WHITE + } else { + DrawRectangle(x, y, cell_size, cell_size, + (Color){120, 120, 120, 255}); // Medium gray for EMPTY + } + DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY); } - - // Draw row clues for current board - for (int r = 0; r < env->size; r++) { - int clue_x = offset_x + 
10; - for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) { - int clue = env->rows_clues[r * MAX_CLUES + clue_idx]; - if (clue > 0) { - char text[4]; - snprintf(text, sizeof(text), "%d", clue); - int y = offset_y + clue_area + r * cell_size + cell_size / 2 - font_size / 2; - DrawText(text, clue_x, y, font_size, RAYWHITE); - clue_x += MeasureText(text, font_size) + 5; - } - } + } + + // Draw solution board + offset_x = solution_x; + + // Draw column clues for solution + for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) { + for (int c = 0; c < env->size; c++) { + int clue = env->cols_clues[c * MAX_CLUES + clue_row]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int x = offset_x + clue_area + c * cell_size + cell_size / 2; + int y = offset_y + clue_row * 20 + 10; + int text_width = MeasureText(text, font_size); + DrawText(text, x - text_width / 2, y, font_size, RAYWHITE); + } } - - // Draw current grid - for (int r = 0; r < env->size; r++) { - for (int c = 0; c < env->size; c++) { - int x = offset_x + clue_area + c * cell_size; - int y = offset_y + clue_area + r * cell_size; - int pos = r * MAX_SIZE + c; - - if (env->observations[pos] == CELL_BLACK) { - DrawRectangle(x, y, cell_size, cell_size, (Color){50, 50, 50, 255}); // Dark gray for BLACK - } else if (env->observations[pos] == CELL_WHITE) { - DrawRectangle(x, y, cell_size, cell_size, (Color){240, 240, 240, 255}); // Light gray for WHITE - } else { - DrawRectangle(x, y, cell_size, cell_size, (Color){120, 120, 120, 255}); // Medium gray for EMPTY - } - DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY); - } + } + + // Draw row clues for solution + for (int r = 0; r < env->size; r++) { + int clue_x = offset_x + 10; + for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) { + int clue = env->rows_clues[r * MAX_CLUES + clue_idx]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int y = offset_y + clue_area + r * cell_size + cell_size 
/ 2 - + font_size / 2; + DrawText(text, clue_x, y, font_size, RAYWHITE); + clue_x += MeasureText(text, font_size) + 5; + } } - - // Draw solution board - offset_x = solution_x; - - // Draw column clues for solution - for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) { - for (int c = 0; c < env->size; c++) { - int clue = env->cols_clues[c * MAX_CLUES + clue_row]; - if (clue > 0) { - char text[4]; - snprintf(text, sizeof(text), "%d", clue); - int x = offset_x + clue_area + c * cell_size + cell_size / 2; - int y = offset_y + clue_row * 20 + 10; - int text_width = MeasureText(text, font_size); - DrawText(text, x - text_width / 2, y, font_size, RAYWHITE); - } - } + } + + // Draw solution grid + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + int x = offset_x + clue_area + c * cell_size; + int y = offset_y + clue_area + r * cell_size; + int pos = r * MAX_SIZE + c; + + if (env->solution[pos] == CELL_BLACK) { + DrawRectangle(x, y, cell_size, cell_size, GREEN); + } else { + DrawRectangle(x, y, cell_size, cell_size, (Color){200, 200, 200, 255}); + } + DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY); } - - // Draw row clues for solution - for (int r = 0; r < env->size; r++) { - int clue_x = offset_x + 10; - for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) { - int clue = env->rows_clues[r * MAX_CLUES + clue_idx]; - if (clue > 0) { - char text[4]; - snprintf(text, sizeof(text), "%d", clue); - int y = offset_y + clue_area + r * cell_size + cell_size / 2 - font_size / 2; - DrawText(text, clue_x, y, font_size, RAYWHITE); - clue_x += MeasureText(text, font_size) + 5; - } - } - } - - // Draw solution grid - for (int r = 0; r < env->size; r++) { - for (int c = 0; c < env->size; c++) { - int x = offset_x + clue_area + c * cell_size; - int y = offset_y + clue_area + r * cell_size; - int pos = r * MAX_SIZE + c; - - if (env->solution[pos] == CELL_BLACK) { - DrawRectangle(x, y, cell_size, cell_size, GREEN); - } else { - 
DrawRectangle(x, y, cell_size, cell_size, (Color){200, 200, 200, 255}); - } - DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY); - } - } - - // Draw status - int board_height = clue_area + env->size * cell_size; - int status_y = board_height + 80; - char status[128]; - snprintf(status, sizeof(status), "Steps: %d/%d | Filled: %d/%d | Size: %dx%d", - env->steps_taken, env->max_steps, env->filled_total, env->target_total, env->size, env->size); - DrawText(status, 20, status_y, 20, RAYWHITE); - - // Draw reward info - char reward_info[128]; - snprintf(reward_info, sizeof(reward_info), "Last Reward: %.3f | Episode Return: %.3f", - env->rewards[0], env->episode_reward); - DrawText(reward_info, 20, status_y + 25, 20, RAYWHITE); - - // Draw instructions - DrawText("Click cells to toggle | Press R to reset | ESC to quit", 20, status_y + 60, 16, LIGHTGRAY); - - EndDrawing(); + } + + // Draw status + int board_height = clue_area + env->size * cell_size; + int status_y = board_height + 80; + char status[128]; + snprintf(status, sizeof(status), "Steps: %d/%d | Filled: %d/%d | Size: %dx%d", + env->steps_taken, env->max_steps, env->filled_total, + env->target_total, env->size, env->size); + DrawText(status, 20, status_y, 20, RAYWHITE); + + // Draw reward info + char reward_info[128]; + snprintf(reward_info, sizeof(reward_info), + "Last Reward: %.3f | Episode Return: %.3f", env->rewards[0], + env->episode_reward); + DrawText(reward_info, 20, status_y + 25, 20, RAYWHITE); + + // Draw instructions + DrawText("Click cells to toggle | Press R to reset | ESC to quit", 20, + status_y + 60, 16, LIGHTGRAY); + + EndDrawing(); } -void c_close(Nonogram* env) { - if (IsWindowReady()) { - CloseWindow(); - } +void c_close(Nonogram *env) { + if (IsWindowReady()) { + CloseWindow(); + } } From d27638e60f8c499d3ffa08b53bbfb2c880891527 Mon Sep 17 00:00:00 2001 From: Eitan Porat Date: Tue, 21 Oct 2025 22:24:34 +0000 Subject: [PATCH 5/5] minor --- pufferlib/config/ocean/nonogram.ini | 2 +- 
pufferlib/ocean/nonogram/nonogram.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pufferlib/config/ocean/nonogram.ini b/pufferlib/config/ocean/nonogram.ini index 7f203644a..a503cf3d8 100644 --- a/pufferlib/config/ocean/nonogram.ini +++ b/pufferlib/config/ocean/nonogram.ini @@ -8,7 +8,7 @@ rnn_name = Recurrent num_envs = 4096 min_size = 4 max_size = 8 -easy_learn = 0 +easy_learn = 1 [sweep] metric = score diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h index 7ef7d9ed9..e14ced293 100644 --- a/pufferlib/ocean/nonogram/nonogram.h +++ b/pufferlib/ocean/nonogram/nonogram.h @@ -154,8 +154,7 @@ float rand_uniform() { return (float)rand() / (float)RAND_MAX; } void c_reset(Nonogram *env) { env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1)); - env->max_steps = - env->easy_learn ? env->size * env->size : 4 * env->size * env->size; + env->max_steps = env->size * env->size; int full_grid_size = MAX_SIZE * MAX_SIZE; int max_clues = MAX_SIZE / 2;