diff --git a/pufferlib/config/ocean/nonogram.ini b/pufferlib/config/ocean/nonogram.ini new file mode 100644 index 000000000..a503cf3d8 --- /dev/null +++ b/pufferlib/config/ocean/nonogram.ini @@ -0,0 +1,52 @@ +[base] +package = ocean +env_name = puffer_nonogram +policy_name = Nonogram +rnn_name = Recurrent + +[env] +num_envs = 4096 +min_size = 4 +max_size = 8 +easy_learn = 1 + +[sweep] +metric = score + +[train] +name = pufferai +seed = 42 +gamma = 0.99965 +device = cuda +compile = False +project = ablations +use_rnn = True +vf_coef = 2.365 +adam_eps = 1.566e-10 +data_dir = experiments +ent_coef = 0.01554 +anneal_lr = True +clip_coef = 0.1267 +optimizer = muon +precision = float32 +adam_beta1 = 0.7912 +adam_beta2 = 0.999949 +batch_size = auto +gae_lambda = 0.9007 +prio_alpha = 0.7441 +prio_beta0 = 0.7365 +cpu_offload = False +bptt_horizon = 64 +compile_mode = max-autotune-no-cudagraphs +vf_clip_coef = 1.598 +learning_rate = 0.007103 +max_grad_norm = 1.275 +update_epochs = 1 +vtrace_c_clip = 0.8692 +minibatch_size = 32768 +total_timesteps = 2e10 +vtrace_rho_clip = 0.9074 +compile_fullgraph = True +max_minibatch_size = 32768 +checkpoint_interval = 200 +torch_deterministic = True diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 93df76506..ea304bd69 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -126,6 +126,7 @@ def make_multiagent(buf=None, **kwargs): 'freeway': 'Freeway', 'enduro': 'Enduro', 'tetris': 'Tetris', + 'nonogram': 'Nonogram', 'cartpole': 'Cartpole', 'moba': 'Moba', 'matsci': 'Matsci', diff --git a/pufferlib/ocean/nonogram/binding.c b/pufferlib/ocean/nonogram/binding.c new file mode 100644 index 000000000..910f5e90f --- /dev/null +++ b/pufferlib/ocean/nonogram/binding.c @@ -0,0 +1,76 @@ +#include "nonogram.h" +#include + +// Forward declare custom methods +static PyObject *vec_get_solutions(PyObject *self, PyObject *args); +static PyObject *vec_get_size(PyObject *self, PyObject *args); + +#define Env Nonogram +#define MY_METHODS \ + {"vec_get_solutions", vec_get_solutions, METH_VARARGS, \ + "Get solutions from all environments"}, \ + {"vec_get_size", vec_get_size, METH_VARARGS, "Get current board size"} + +#include "../env_binding.h" + +static int my_init(Env *env, PyObject *args, PyObject *kwargs) { + env->min_size = unpack(kwargs, "min_size"); + env->max_size = unpack(kwargs, "max_size"); + env->easy_learn = unpack(kwargs, "easy_learn"); + env->size = env->max_size; + env->max_steps = 4 * env->max_size * env->max_size; + return 0; +} + +static int my_log(PyObject *dict, Log *log) { + assign_to_dict(dict, "score", log->score); + assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "episode_length", log->episode_length); + assign_to_dict(dict, "solved", log->solved); + return 0; +} + +// Custom method to get solutions from all environments +static PyObject *vec_get_solutions(PyObject *self, PyObject *args) { + if (PyTuple_Size(args) != 2) { + PyErr_SetString(PyExc_TypeError, "vec_get_solutions requires 2 arguments"); + return NULL; + } + + VecEnv *vec = unpack_vecenv(args); + if (!vec) { + return NULL; + } + + PyObject *solutions_obj = PyTuple_GetItem(args, 1); + if (!PyObject_TypeCheck(solutions_obj, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "solutions must be a NumPy array"); + return NULL; + } + PyArrayObject *solutions = (PyArrayObject *)solutions_obj; + if (!PyArray_ISCONTIGUOUS(solutions)) { + PyErr_SetString(PyExc_ValueError, "solutions must be contiguous"); + 
return NULL; + } + + // Copy solutions from each environment (always use max_size for buffer) + unsigned char *sol_ptr = PyArray_DATA(solutions); + int max_grid_size = MAX_SIZE * MAX_SIZE; + for (int i = 0; i < vec->num_envs; i++) { + Nonogram *env = vec->envs[i]; + memcpy(sol_ptr + i * max_grid_size, env->solution, max_grid_size); + } + + Py_RETURN_NONE; +} + + // Get current board size from first environment +static PyObject *vec_get_size(PyObject *self, PyObject *args) { + VecEnv *vec = unpack_vecenv(args); + if (!vec) { + return NULL; + } + + Nonogram *env = vec->envs[0]; + return PyLong_FromLong(env->size); +} diff --git a/pufferlib/ocean/nonogram/nonogram.c b/pufferlib/ocean/nonogram/nonogram.c new file mode 100644 index 000000000..91e07bbd4 --- /dev/null +++ b/pufferlib/ocean/nonogram/nonogram.c @@ -0,0 +1,32 @@ +/* Pure C demo file for Nonogram. Build it with: + * bash scripts/build_ocean.sh nonogram local (debug) + * bash scripts/build_ocean.sh nonogram fast + */ + +#include "nonogram.h" + +int main() { + Nonogram env = {.size = 8, .min_size = 8, .max_size = 8}; /* c_reset re-samples size from [min_size, max_size] */ + int max_clues = env.size / 2; + int obs_size = env.size * env.size + 2 * env.size * max_clues + 1; /* +1 for the board-size byte written by c_reset */ + + env.max_steps = 4 * env.size * env.size; + env.observations = (unsigned char *)calloc(obs_size, sizeof(unsigned char)); + env.actions = (int *)calloc(1, sizeof(int)); + env.rewards = (float *)calloc(1, sizeof(float)); + env.terminals = (unsigned char *)calloc(1, sizeof(unsigned char)); + + c_reset(&env); + c_render(&env); + while (!WindowShouldClose()) { + env.actions[0] = rand() % (env.size * env.size); + c_step(&env); + c_render(&env); + } + + free(env.observations); + free(env.actions); + free(env.rewards); + free(env.terminals); + c_close(&env); +} diff --git a/pufferlib/ocean/nonogram/nonogram.h b/pufferlib/ocean/nonogram/nonogram.h new file mode 100644 index 000000000..e14ced293 --- /dev/null +++ b/pufferlib/ocean/nonogram/nonogram.h @@ -0,0 +1,605 @@ +/* Nonogram: A logic puzzle environment + * Players fill cells based on row and column clues (run-length encoding) + */ + +#include "raylib.h" +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#define MAX_SIZE 8 +#define MAX_CLUES (MAX_SIZE / 2) + +const unsigned char CELL_EMPTY = 0; +const unsigned char CELL_WHITE = 1; +const unsigned char CELL_BLACK = 2; +const unsigned char CELL_PADDING = 3; + +const float REWARD_WIN = 1.0; +const float REWARD_INVALID_MOVE = -0.2; +const float REWARD_OUT_OF_BOUNDS = -0.2; +const float REWARD_TIMEOUT = -0.1; +const float REWARD_COMPLETE_LINE = 0.02; +const float REWARD_EASY_LEARN_CORRECT = 0.01; +const float REWARD_EASY_LEARN_INCORRECT = -0.01; +const float REWARD_NO_MATCH = -0.05; + +typedef struct { + float score; + float episode_return; + float episode_length; + float solved; + float n; +} Log; + +typedef struct { + Log log; + unsigned char *observations; + int *actions; + float *rewards; + unsigned char *terminals; + + int size; + int min_size; + int max_size; + int max_steps; + int steps_taken; + int filled_total; + int target_total; + int easy_learn; + + unsigned char solution[MAX_SIZE * MAX_SIZE]; + + unsigned char rows_clues[MAX_SIZE * MAX_CLUES]; + unsigned char cols_clues[MAX_SIZE * MAX_CLUES]; + unsigned char rows_num_runs[MAX_SIZE]; + unsigned char cols_num_runs[MAX_SIZE]; + unsigned char rows_target_sum[MAX_SIZE]; + unsigned char cols_target_sum[MAX_SIZE]; + unsigned char rows_max_clue[MAX_SIZE]; + unsigned char cols_max_clue[MAX_SIZE]; + + unsigned char rows_totals[MAX_SIZE]; + unsigned char cols_totals[MAX_SIZE]; + + unsigned char rows_completed[MAX_SIZE]; + unsigned
char cols_completed[MAX_SIZE]; + + float episode_reward; +} Nonogram; + +void add_log(Nonogram *env) { + env->log.score += env->rewards[0]; + env->log.episode_length += env->steps_taken; + env->log.episode_return += env->episode_reward; + env->log.solved += (env->rewards[0] > 0) ? 1 : 0; + env->log.n++; +} + +int get_row_run_length(Nonogram *env, int row, int col) { + int row_start = row * MAX_SIZE; + int run_length = 1; + + for (int c = col - 1; c >= 0; c--) { + if (env->observations[row_start + c] == CELL_BLACK) { + run_length++; + } else { + break; + } + } + + for (int c = col + 1; c < env->size; c++) { + if (env->observations[row_start + c] == CELL_BLACK) { + run_length++; + } else { + break; + } + } + + return run_length; +} + +int get_col_run_length(Nonogram *env, int row, int col) { + int run_length = 1; + + for (int r = row - 1; r >= 0; r--) { + if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { + run_length++; + } else { + break; + } + } + + for (int r = row + 1; r < env->size; r++) { + if (env->observations[r * MAX_SIZE + col] == CELL_BLACK) { + run_length++; + } else { + break; + } + } + + return run_length; +} + +int check_line_matches(unsigned char *line_data, unsigned char *clues, + int num_runs, int size) { + int run_idx = 0; + int count = 0; + + for (int i = 0; i < size; i++) { + if (line_data[i] == CELL_BLACK) { + count++; + } else if (line_data[i] == CELL_EMPTY || line_data[i] == CELL_WHITE) { + if (count > 0) { + if (clues[run_idx] != count) { + return 0; + } + run_idx++; + count = 0; + } + } + } + + if (count > 0) { + if (clues[run_idx] != count) { + return 0; + } + run_idx++; + } + + return (run_idx == num_runs); +} + +float rand_uniform() { return (float)rand() / (float)RAND_MAX; } + +void c_reset(Nonogram *env) { + env->size = env->min_size + (rand() % (env->max_size - env->min_size + 1)); + env->max_steps = env->size * env->size; + + int full_grid_size = MAX_SIZE * MAX_SIZE; + int max_clues = MAX_SIZE / 2; + + memset(env->observations, CELL_PADDING, full_grid_size); + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + env->observations[r * MAX_SIZE + c] = CELL_EMPTY; + } + } + memset(env->observations + full_grid_size, 0, 2 * MAX_SIZE * max_clues); + + float fill_prob = rand_uniform(); + memset(env->solution, CELL_WHITE, MAX_SIZE * MAX_SIZE); + int has_filled = 0; + for (int i = 0; i < env->size; i++) { + for (int j = 0; j < env->size; j++) { + if (rand_uniform() < fill_prob) { + env->solution[i * MAX_SIZE + j] = CELL_BLACK; + has_filled = 1; + } + } + } + + if (!has_filled) { + int rand_row = rand() % env->size; + int rand_col = rand() % env->size; + env->solution[rand_row * MAX_SIZE + rand_col] = CELL_BLACK; + } + + memset(env->rows_clues, 0, MAX_SIZE * MAX_CLUES); + memset(env->cols_clues, 0, MAX_SIZE * MAX_CLUES); + + for (int i = 0; i < env->size; i++) { + int clue_idx = 0; + int count = 0; + for (int j = 0; j < env->size; j++) { + if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) { + count++; + } else if (count > 0) { + env->rows_clues[i * MAX_CLUES + clue_idx] = count; + clue_idx++; + count = 0; + } + } + if (count > 0) { + env->rows_clues[i * MAX_CLUES + clue_idx] = count; + clue_idx++; + } + env->rows_num_runs[i] = clue_idx; + } + + for (int j = 0; j < env->size; j++) { + int clue_idx = 0; + int count = 0; + for (int i = 0; i < env->size; i++) { + if (env->solution[i * MAX_SIZE + j] == CELL_BLACK) { + count++; + } else if (count > 0) { + env->cols_clues[j * MAX_CLUES + clue_idx] = count; + clue_idx++; + count = 0; + } + 
} + if (count > 0) { + env->cols_clues[j * MAX_CLUES + clue_idx] = count; + clue_idx++; + } + env->cols_num_runs[j] = clue_idx; + } + + memcpy(env->observations + full_grid_size, env->rows_clues, + MAX_SIZE * max_clues); + memcpy(env->observations + full_grid_size + MAX_SIZE * max_clues, + env->cols_clues, MAX_SIZE * max_clues); + + env->observations[full_grid_size + 2 * MAX_SIZE * max_clues] = env->size; + + memset(env->rows_totals, 0, MAX_SIZE); + memset(env->cols_totals, 0, MAX_SIZE); + memset(env->rows_completed, 0, MAX_SIZE); + memset(env->cols_completed, 0, MAX_SIZE); + env->filled_total = 0; + + for (int i = 0; i < env->size; i++) { + int max_clue = 0; + int sum = 0; + for (int j = 0; j < max_clues; j++) { + int clue = env->rows_clues[i * MAX_CLUES + j]; + if (clue > max_clue) { + max_clue = clue; + } + sum += clue; + } + env->rows_max_clue[i] = max_clue; + env->rows_target_sum[i] = sum; + + max_clue = 0; + sum = 0; + for (int j = 0; j < max_clues; j++) { + int clue = env->cols_clues[i * MAX_CLUES + j]; + if (clue > max_clue) { + max_clue = clue; + } + sum += clue; + } + env->cols_max_clue[i] = max_clue; + env->cols_target_sum[i] = sum; + } + + env->target_total = 0; + for (int i = 0; i < env->size; i++) { + env->target_total += env->rows_target_sum[i]; + } + + env->steps_taken = 0; + env->episode_reward = 0; +} + +void c_step(Nonogram *env) { + int action = env->actions[0]; + + env->terminals[0] = 0; + env->rewards[0] = 0; + + env->steps_taken++; + + if (env->steps_taken > env->max_steps) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_TIMEOUT; + env->episode_reward += REWARD_TIMEOUT; + add_log(env); + c_reset(env); + return; + } + + int mark_black = action >= (MAX_SIZE * MAX_SIZE); + int pos = action % (MAX_SIZE * MAX_SIZE); + + int row = pos / MAX_SIZE; + int col = pos % MAX_SIZE; + + if (row >= env->size || col >= env->size) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_OUT_OF_BOUNDS; + env->episode_reward += REWARD_OUT_OF_BOUNDS; + add_log(env); + c_reset(env); + return; + } + + unsigned char current = env->observations[pos]; + + if (current != CELL_EMPTY) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } + + if (mark_black) { + if (env->rows_totals[row] == env->rows_target_sum[row] || + env->cols_totals[col] == env->cols_target_sum[col]) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } + + int row_run = get_row_run_length(env, row, col); + + if (row_run > env->rows_max_clue[row]) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } + + int col_run = get_col_run_length(env, row, col); + + if (col_run > env->cols_max_clue[col]) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_INVALID_MOVE; + env->episode_reward += REWARD_INVALID_MOVE; + add_log(env); + c_reset(env); + return; + } + + int row_completed = 0; + int col_completed = 0; + + if (env->rows_totals[row] == env->rows_target_sum[row] - 1) { + env->observations[pos] = CELL_BLACK; + int row_start = row * MAX_SIZE; + int matches = check_line_matches(env->observations + row_start, + env->rows_clues + row * MAX_CLUES, + env->rows_num_runs[row], env->size); + if (!matches) { + env->observations[pos] = CELL_EMPTY; + env->terminals[0] = 1; + env->rewards[0] = REWARD_NO_MATCH; + 
env->episode_reward += REWARD_NO_MATCH; + add_log(env); + c_reset(env); + return; + } + env->observations[pos] = CELL_EMPTY; + row_completed = 1; + } + + if (env->cols_totals[col] == env->cols_target_sum[col] - 1) { + env->observations[pos] = CELL_BLACK; + unsigned char col_data[MAX_SIZE]; + for (int i = 0; i < env->size; i++) { + col_data[i] = env->observations[i * MAX_SIZE + col]; + } + int matches = + check_line_matches(col_data, env->cols_clues + col * MAX_CLUES, + env->cols_num_runs[col], env->size); + if (!matches) { + env->observations[pos] = CELL_EMPTY; + env->terminals[0] = 1; + env->rewards[0] = REWARD_NO_MATCH; + env->episode_reward += REWARD_NO_MATCH; + add_log(env); + c_reset(env); + return; + } + env->observations[pos] = CELL_EMPTY; + col_completed = 1; + } + + env->observations[pos] = CELL_BLACK; + env->rows_totals[row]++; + env->cols_totals[col]++; + env->filled_total++; + + int row_newly_completed = row_completed && !env->rows_completed[row]; + int col_newly_completed = col_completed && !env->cols_completed[col]; + + if (row_newly_completed) + env->rows_completed[row] = 1; + if (col_newly_completed) + env->cols_completed[col] = 1; + + float line_reward = + (row_newly_completed + col_newly_completed) * REWARD_COMPLETE_LINE; + env->rewards[0] += line_reward; + env->episode_reward += line_reward; + } else { + env->observations[pos] = CELL_WHITE; + } + + if (env->easy_learn) { + unsigned char solution_cell = env->solution[pos]; + unsigned char actual = env->observations[pos]; + + if (solution_cell == actual) { + env->rewards[0] += REWARD_EASY_LEARN_CORRECT; + env->episode_reward += REWARD_EASY_LEARN_CORRECT; + } else { + env->rewards[0] += REWARD_EASY_LEARN_INCORRECT; + env->episode_reward += REWARD_EASY_LEARN_INCORRECT; + env->terminals[0] = 1; + add_log(env); + c_reset(env); + return; + } + } + + if (env->filled_total == env->target_total) { + env->terminals[0] = 1; + env->rewards[0] = REWARD_WIN; + env->episode_reward += REWARD_WIN; + add_log(env); + c_reset(env); + return; + } +} + +void c_render(Nonogram *env) { + if (!IsWindowReady()) { + int board_width = 120 + MAX_SIZE * 40; + int board_height = 120 + MAX_SIZE * 40; + int screen_width = board_width * 2 + 60 + 40; + int screen_height = board_height + 140; + InitWindow(screen_width, screen_height, "Nonogram (C)"); + SetTargetFPS(60); + } + + if (IsKeyDown(KEY_ESCAPE)) { + exit(0); + } + + BeginDrawing(); + ClearBackground((Color){0, 0, 0, 255}); + + int cell_size = 40; + int clue_area = 120; + int board_spacing = 60; + int font_size = 20; + + // Draw titles + DrawText("CURRENT BOARD", 20, 20, 24, RAYWHITE); + int solution_x = clue_area + env->size * cell_size + board_spacing + 20; + DrawText("SOLUTION", solution_x, 20, 24, RAYWHITE); + + // Draw current board + int offset_x = 20; + int offset_y = 60; + + // Draw column clues for current board + for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) { + for (int c = 0; c < env->size; c++) { + int clue = env->cols_clues[c * MAX_CLUES + clue_row]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int x = offset_x + clue_area + c * cell_size + cell_size / 2; + int y = offset_y + clue_row * 20 + 10; + int text_width = MeasureText(text, font_size); + DrawText(text, x - text_width / 2, y, font_size, RAYWHITE); + } + } + } + + // Draw row clues for current board + for (int r = 0; r < env->size; r++) { + int clue_x = offset_x + 10; + for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) { + int clue = env->rows_clues[r * MAX_CLUES + clue_idx]; + 
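+ // Clue slots are zero-padded out to MAX_CLUES; only non-zero entries are drawn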
if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int y = offset_y + clue_area + r * cell_size + cell_size / 2 - + font_size / 2; + DrawText(text, clue_x, y, font_size, RAYWHITE); + clue_x += MeasureText(text, font_size) + 5; + } + } + } + + // Draw current grid + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + int x = offset_x + clue_area + c * cell_size; + int y = offset_y + clue_area + r * cell_size; + int pos = r * MAX_SIZE + c; + + if (env->observations[pos] == CELL_BLACK) { + DrawRectangle(x, y, cell_size, cell_size, + (Color){50, 50, 50, 255}); // Dark gray for BLACK + } else if (env->observations[pos] == CELL_WHITE) { + DrawRectangle(x, y, cell_size, cell_size, + (Color){240, 240, 240, 255}); // Light gray for WHITE + } else { + DrawRectangle(x, y, cell_size, cell_size, + (Color){120, 120, 120, 255}); // Medium gray for EMPTY + } + DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY); + } + } + + // Draw solution board + offset_x = solution_x; + + // Draw column clues for solution + for (int clue_row = 0; clue_row < MAX_CLUES; clue_row++) { + for (int c = 0; c < env->size; c++) { + int clue = env->cols_clues[c * MAX_CLUES + clue_row]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int x = offset_x + clue_area + c * cell_size + cell_size / 2; + int y = offset_y + clue_row * 20 + 10; + int text_width = MeasureText(text, font_size); + DrawText(text, x - text_width / 2, y, font_size, RAYWHITE); + } + } + } + + // Draw row clues for solution + for (int r = 0; r < env->size; r++) { + int clue_x = offset_x + 10; + for (int clue_idx = 0; clue_idx < MAX_CLUES; clue_idx++) { + int clue = env->rows_clues[r * MAX_CLUES + clue_idx]; + if (clue > 0) { + char text[4]; + snprintf(text, sizeof(text), "%d", clue); + int y = offset_y + clue_area + r * cell_size + cell_size / 2 - + font_size / 2; + DrawText(text, clue_x, y, font_size, RAYWHITE); + clue_x += MeasureText(text, font_size) + 5; + } + } + } + + // Draw solution grid + for (int r = 0; r < env->size; r++) { + for (int c = 0; c < env->size; c++) { + int x = offset_x + clue_area + c * cell_size; + int y = offset_y + clue_area + r * cell_size; + int pos = r * MAX_SIZE + c; + + if (env->solution[pos] == CELL_BLACK) { + DrawRectangle(x, y, cell_size, cell_size, GREEN); + } else { + DrawRectangle(x, y, cell_size, cell_size, (Color){200, 200, 200, 255}); + } + DrawRectangleLines(x, y, cell_size, cell_size, LIGHTGRAY); + } + } + + // Draw status + int board_height = clue_area + env->size * cell_size; + int status_y = board_height + 80; + char status[128]; + snprintf(status, sizeof(status), "Steps: %d/%d | Filled: %d/%d | Size: %dx%d", + env->steps_taken, env->max_steps, env->filled_total, + env->target_total, env->size, env->size); + DrawText(status, 20, status_y, 20, RAYWHITE); + + // Draw reward info + char reward_info[128]; + snprintf(reward_info, sizeof(reward_info), + "Last Reward: %.3f | Episode Return: %.3f", env->rewards[0], + env->episode_reward); + DrawText(reward_info, 20, status_y + 25, 20, RAYWHITE); + + // Draw instructions + DrawText("Click cells to toggle | Press R to reset | ESC to quit", 20, + status_y + 60, 16, LIGHTGRAY); + + EndDrawing(); +} + +void c_close(Nonogram *env) { + if (IsWindowReady()) { + CloseWindow(); + } +} diff --git a/pufferlib/ocean/nonogram/nonogram.py b/pufferlib/ocean/nonogram/nonogram.py new file mode 100644 index 000000000..c58aa1180 --- /dev/null +++ b/pufferlib/ocean/nonogram/nonogram.py @@ -0,0 +1,85 @@ 
+'''Nonogram logic puzzle environment''' + +import gymnasium +import numpy as np + +import pufferlib +from pufferlib.ocean.nonogram import binding + +MAX_SIZE = 8 +MIN_SIZE = 4 +MAX_CLUES = MAX_SIZE // 2 +OBS_SIZE = MAX_SIZE * MAX_SIZE + 2 * MAX_SIZE * MAX_CLUES + 1 # +1 for board size + +class Nonogram(pufferlib.PufferEnv): + def __init__(self, num_envs=1, render_mode=None, log_interval=128, + min_size=4, max_size=8, easy_learn=0, buf=None, seed=0): + # Observation space: grid cells (0-3: EMPTY/WHITE/BLACK/PADDING), clues (0-max_size), size encoding (0-1) + # Using max_size as high covers all values + self.single_observation_space = gymnasium.spaces.Box(low=0, high=max_size, + shape=(OBS_SIZE,), dtype=np.uint8) + # Action space: 0-63 = mark WHITE, 64-127 = mark BLACK + self.single_action_space = gymnasium.spaces.Discrete(MAX_SIZE * MAX_SIZE * 2) + self.render_mode = render_mode + self.num_agents = num_envs + self.log_interval = log_interval + + super().__init__(buf) + self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards, + self.terminals, self.truncations, num_envs, seed, + min_size=min_size, max_size=max_size, easy_learn=easy_learn) + + self.solutions = np.zeros((num_envs, max_size * max_size), dtype=np.uint8) + + def reset(self, seed=0): + binding.vec_reset(self.c_envs, seed) + self.tick = 0 + return self.observations, [] + + def step(self, actions): + self.tick += 1 + + self.actions[:] = actions + binding.vec_step(self.c_envs) + + info = [] + if self.tick % self.log_interval == 0: + info.append(binding.vec_log(self.c_envs)) + + return (self.observations, self.rewards, + self.terminals, self.truncations, info) + + def render(self): + binding.vec_render(self.c_envs, 0) + + def close(self): + binding.vec_close(self.c_envs) + + def get_solutions(self): + """Get the solution grids for all environments""" + binding.vec_get_solutions(self.c_envs, self.solutions) + return self.solutions + + def get_size(self): + """Get current board size""" + return binding.vec_get_size(self.c_envs) + +if __name__ == '__main__': + N = 4096 + + env = Nonogram(num_envs=N, min_size=2, max_size=8) + env.reset() + steps = 0 + + CACHE = 1024 + actions = np.random.randint(0, 64, (CACHE, N)) + + i = 0 + import time + start = time.time() + while time.time() - start < 10: + env.step(actions[i % CACHE]) + steps += N + i += 1 + + print('Nonogram SPS:', int(steps / (time.time() - start))) diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index c414acde2..726e9c6b9 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -942,6 +942,85 @@ def decode_actions(self, hidden): value = self.value_fn(hidden) # (B, 1) return action, value +class NonogramLSTM(pufferlib.models.LSTMWrapper): + def __init__(self, env, policy, input_size=256, hidden_size=256): + super().__init__(env, policy, input_size, hidden_size) + + +class Nonogram(nn.Module): + def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwargs): + super().__init__() + self.hidden_size = hidden_size + self.is_continuous = False + + self.conv_grid = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Conv2d(4, cnn_channels, kernel_size=3, stride=1, padding=1)), + nn.ReLU(), + pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, kernel_size=3, stride=2, padding=1)), + nn.ReLU(), + pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, kernel_size=3, stride=2, padding=1)), + nn.ReLU(), + nn.Flatten(), + pufferlib.pytorch.layer_init(nn.Linear(cnn_channels * 2 * 2, input_size)), 
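+ # 8x8 grid -> 4x4 -> 2x2 after the two stride-2 convs, so the flatten yields cnn_channels * 2 * 2 features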
+ ) + + self.fc_row_clues = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(8 * 4 * 9, input_size // 2)), + nn.ReLU(), + ) + + self.fc_col_clues = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(8 * 4 * 9, input_size // 2)), + nn.ReLU(), + ) + + self.fc_size = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(9, input_size // 4)), + nn.ReLU(), + ) + + self.proj = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(input_size + input_size // 2 + input_size // 2 + input_size // 4, hidden_size)), + nn.ReLU(), + ) + + self.actor = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.value_fn = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, 1), std=1) + + def forward(self, observations, state=None): + hidden = self.encode_observations(observations) + actions, value = self.decode_actions(hidden) + return actions, value + + def forward_train(self, x, state=None): + return self.forward(x, state) + + def encode_observations(self, observations, state=None): + B = observations.shape[0] + + grid = F.one_hot(observations[:, :64].view(B, 8, 8).long(), 4).permute(0, 3, 1, 2).float() + row_clues = F.one_hot(observations[:, 64:96].view(B, 8, 4).long(), 9).float() + col_clues = F.one_hot(observations[:, 96:128].view(B, 8, 4).long(), 9).float() + board_size = F.one_hot(observations[:, 128].long(), 9).float() + + grid_feat = self.conv_grid(grid) + row_feat = self.fc_row_clues(row_clues.reshape(B, -1)) + col_feat = self.fc_col_clues(col_clues.reshape(B, -1)) + size_feat = self.fc_size(board_size) + + combined = torch.cat([grid_feat, row_feat, col_feat, size_feat], dim=-1) + features = self.proj(combined) + + return features + + def decode_actions(self, flat_hidden): + action = self.actor(flat_hidden) + value = self.value_fn(flat_hidden) + return action, value + + class Drone(nn.Module): ''' Drone policy. Flattens obs and applies a linear layer. '''
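
Observation layout reference: each flat observation produced by nonogram.h is 129 uint8 values: a zero-padded 8x8 grid (indices 0-63, values CELL_EMPTY/WHITE/BLACK/PADDING), row clues (indices 64-95, 8 rows x 4 zero-padded slots), column clues (indices 96-127), and the active board size at index 128. The Nonogram policy's encode_observations slices the tensor at exactly these offsets. The sketch below decodes a single observation with NumPy; decode_observation is an illustrative helper, not part of this diff.

import numpy as np

MAX_SIZE = 8
MAX_CLUES = MAX_SIZE // 2
GRID = MAX_SIZE * MAX_SIZE      # 64 grid cells
CLUES = MAX_SIZE * MAX_CLUES    # 32 clue slots per axis

def decode_observation(obs):
    """Split one flat (129,) uint8 observation into grid, clues, and board size."""
    obs = np.asarray(obs, dtype=np.uint8)
    grid = obs[:GRID].reshape(MAX_SIZE, MAX_SIZE)                    # 0=EMPTY, 1=WHITE, 2=BLACK, 3=PADDING
    row_clues = obs[GRID:GRID + CLUES].reshape(MAX_SIZE, MAX_CLUES)  # run lengths per row, zero-padded
    col_clues = obs[GRID + CLUES:GRID + 2 * CLUES].reshape(MAX_SIZE, MAX_CLUES)
    size = int(obs[GRID + 2 * CLUES])                                # active board size (min_size..max_size)
    return grid, row_clues, col_clues, size

# Usage sketch (assumes the nonogram binding has been built):
# from pufferlib.ocean.nonogram.nonogram import Nonogram
# env = Nonogram(num_envs=1)
# obs, _ = env.reset()
# grid, row_clues, col_clues, size = decode_observation(obs[0])
# Actions follow c_step: action = row * MAX_SIZE + col marks WHITE,
# and action + MAX_SIZE * MAX_SIZE marks BLACK at the same cell.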