From 1ee71108569950cde5f2a8cfad8919b920f7d5f1 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Fri, 11 Jul 2025 10:37:02 +0800 Subject: [PATCH 1/4] Implement island-specific best program tracking Added tracking and updating of best programs per island to support proper island-based evolution. Updated inspiration sampling and top program queries to maintain genetic isolation between islands. Adjusted prompt context in iteration to use island-specific top programs. --- openevolve/database.py | 200 +++++++++++++++++++++++++++++++--------- openevolve/iteration.py | 10 +- 2 files changed, 163 insertions(+), 47 deletions(-) diff --git a/openevolve/database.py b/openevolve/database.py index 8aba55283..66d2c25af 100644 --- a/openevolve/database.py +++ b/openevolve/database.py @@ -122,6 +122,9 @@ def __init__(self, config: DatabaseConfig): # Track the absolute best program separately self.best_program_id: Optional[str] = None + + # Track best program per island for proper island-based evolution + self.island_best_programs: List[Optional[str]] = [None] * config.num_islands # Track the last iteration number (for resuming) self.last_iteration: int = 0 @@ -205,6 +208,9 @@ def add( # Update the absolute best program tracking (after population enforcement) self._update_best_program(program) + + # Update island-specific best program tracking + self._update_island_best_program(program, island_idx) # Save to disk if configured if self.config.db_path: @@ -315,13 +321,14 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]: return sorted_programs[0] if sorted_programs else None - def get_top_programs(self, n: int = 10, metric: Optional[str] = None) -> List[Program]: + def get_top_programs(self, n: int = 10, metric: Optional[str] = None, island_idx: Optional[int] = None) -> List[Program]: """ Get the top N programs based on a metric Args: n: Number of programs to return metric: Metric to use for ranking (uses average if None) + island_idx: If specified, only return programs from this island Returns: List of top programs @@ -329,17 +336,32 @@ def get_top_programs(self, n: int = 10, metric: Optional[str] = None) -> List[Pr if not self.programs: return [] + # Get candidate programs + if island_idx is not None: + # Island-specific query + island_programs = [ + self.programs[pid] for pid in self.islands[island_idx] + if pid in self.programs + ] + candidates = island_programs + else: + # Global query + candidates = list(self.programs.values()) + + if not candidates: + return [] + if metric: # Sort by specific metric sorted_programs = sorted( - [p for p in self.programs.values() if metric in p.metrics], + [p for p in candidates if metric in p.metrics], key=lambda p: p.metrics[metric], reverse=True, ) else: # Sort by average of all numeric metrics sorted_programs = sorted( - self.programs.values(), + candidates, key=lambda p: safe_numeric_average(p.metrics), reverse=True, ) @@ -379,6 +401,7 @@ def save(self, path: Optional[str] = None, iteration: int = 0) -> None: "islands": [list(island) for island in self.islands], "archive": list(self.archive), "best_program_id": self.best_program_id, + "island_best_programs": self.island_best_programs, "last_iteration": iteration or self.last_iteration, "current_island": self.current_island, "island_generations": self.island_generations, @@ -412,6 +435,7 @@ def load(self, path: str) -> None: saved_islands = metadata.get("islands", []) self.archive = set(metadata.get("archive", [])) self.best_program_id = metadata.get("best_program_id") + 
self.island_best_programs = metadata.get("island_best_programs", [None] * len(saved_islands)) self.last_iteration = metadata.get("last_iteration", 0) self.current_island = metadata.get("current_island", 0) self.island_generations = metadata.get("island_generations", [0] * len(saved_islands)) @@ -440,6 +464,10 @@ def load(self, path: str) -> None: # Ensure island_generations list has correct length if len(self.island_generations) != len(self.islands): self.island_generations = [0] * len(self.islands) + + # Ensure island_best_programs list has correct length + if len(self.island_best_programs) != len(self.islands): + self.island_best_programs = [None] * len(self.islands) logger.info(f"Loaded database with {len(self.programs)} programs from {path}") @@ -748,6 +776,53 @@ def _update_best_program(self, program: Program) -> None: else: logger.info(f"New best program {program.id} replaces {old_id}") + def _update_island_best_program(self, program: Program, island_idx: int) -> None: + """ + Update the best program tracking for a specific island + + Args: + program: Program to consider as the new best for the island + island_idx: Island index + """ + # Ensure island_idx is valid + if island_idx >= len(self.island_best_programs): + logger.warning(f"Invalid island index {island_idx}, skipping island best update") + return + + # If island doesn't have a best program yet, this becomes the best + current_island_best_id = self.island_best_programs[island_idx] + if current_island_best_id is None: + self.island_best_programs[island_idx] = program.id + logger.debug(f"Set initial best program for island {island_idx} to {program.id}") + return + + # Check if current best still exists + if current_island_best_id not in self.programs: + logger.warning( + f"Island {island_idx} best program {current_island_best_id} no longer exists, updating to {program.id}" + ) + self.island_best_programs[island_idx] = program.id + return + + current_island_best = self.programs[current_island_best_id] + + # Update if the new program is better + if self._is_better(program, current_island_best): + old_id = current_island_best_id + self.island_best_programs[island_idx] = program.id + + # Log the change + if "combined_score" in program.metrics and "combined_score" in current_island_best.metrics: + old_score = current_island_best.metrics["combined_score"] + new_score = program.metrics["combined_score"] + score_diff = new_score - old_score + logger.debug( + f"Island {island_idx}: New best program {program.id} replaces {old_id} " + f"(combined_score: {old_score:.4f} → {new_score:.4f}, +{score_diff:.4f})" + ) + else: + logger.debug(f"Island {island_idx}: New best program {program.id} replaces {old_id}") + def _sample_parent(self) -> Program: """ Sample a parent program from the current island for the next evolution step @@ -869,91 +944,124 @@ def _sample_random_parent(self) -> Program: def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: """ - Sample inspiration programs for the next evolution step + Sample inspiration programs for the next evolution step. + + For proper island-based evolution, inspirations are sampled ONLY from the + current island, maintaining genetic isolation between islands. 
Args: parent: Parent program n: Number of inspirations to sample Returns: - List of inspiration programs + List of inspiration programs from the current island """ inspirations = [] + + # Get the parent's island (should be current_island) + parent_island = parent.metadata.get("island", self.current_island) + + # Get all programs from the current island + island_program_ids = list(self.islands[parent_island]) + island_programs = [self.programs[pid] for pid in island_program_ids if pid in self.programs] + + if not island_programs: + logger.warning(f"Island {parent_island} has no programs for inspiration sampling") + return [] - # Always include the absolute best program if available and different from parent + # Include the island's best program if available and different from parent + island_best_id = self.island_best_programs[parent_island] if ( - self.best_program_id is not None - and self.best_program_id != parent.id - and self.best_program_id in self.programs + island_best_id is not None + and island_best_id != parent.id + and island_best_id in self.programs ): - best_program = self.programs[self.best_program_id] - inspirations.append(best_program) - logger.debug(f"Including best program {self.best_program_id} in inspirations") - elif self.best_program_id is not None and self.best_program_id not in self.programs: - # Clean up stale best program reference + island_best = self.programs[island_best_id] + inspirations.append(island_best) + logger.debug(f"Including island {parent_island} best program {island_best_id} in inspirations") + elif island_best_id is not None and island_best_id not in self.programs: + # Clean up stale island best reference logger.warning( - f"Best program {self.best_program_id} no longer exists, clearing reference" + f"Island {parent_island} best program {island_best_id} no longer exists, clearing reference" ) - self.best_program_id = None + self.island_best_programs[parent_island] = None - # Add top programs as inspirations + # Add top programs from the island as inspirations top_n = max(1, int(n * self.config.elite_selection_ratio)) - top_programs = self.get_top_programs(n=top_n) - for program in top_programs: + top_island_programs = self.get_top_programs(n=top_n, island_idx=parent_island) + for program in top_island_programs: if program.id not in [p.id for p in inspirations] and program.id != parent.id: inspirations.append(program) - # Add diverse programs using config.num_diverse_programs - if len(self.programs) > n and len(inspirations) < n: - # Calculate how many diverse programs to add (up to remaining slots) + # Add diverse programs from within the island + if len(island_programs) > n and len(inspirations) < n: remaining_slots = n - len(inspirations) - # Sample from different feature cells for diversity + # Try to sample from different feature cells within the island feature_coords = self._calculate_feature_coords(parent) - - # Get programs from nearby feature cells nearby_programs = [] - for _ in range(remaining_slots): + + # Create a mapping of feature cells to island programs for efficient lookup + island_feature_map = {} + for prog_id in island_program_ids: + if prog_id in self.programs: + prog = self.programs[prog_id] + prog_coords = self._calculate_feature_coords(prog) + cell_key = self._feature_coords_to_key(prog_coords) + island_feature_map[cell_key] = prog_id + + # Try to find programs from nearby feature cells within the island + for _ in range(remaining_slots * 3): # Try more times to find nearby programs # Perturb coordinates perturbed_coords = [ 
- max(0, min(self.feature_bins - 1, c + random.randint(-1, 1))) + max(0, min(self.feature_bins - 1, c + random.randint(-2, 2))) for c in feature_coords ] - - # Try to get program from this cell + cell_key = self._feature_coords_to_key(perturbed_coords) - if cell_key in self.feature_map: - program_id = self.feature_map[cell_key] - # Check if program still exists before adding + if cell_key in island_feature_map: + program_id = island_feature_map[cell_key] if ( program_id != parent.id and program_id not in [p.id for p in inspirations] + and program_id not in [p.id for p in nearby_programs] and program_id in self.programs ): nearby_programs.append(self.programs[program_id]) - elif program_id not in self.programs: - # Clean up stale reference in feature_map - logger.debug(f"Removing stale program {program_id} from feature_map") - del self.feature_map[cell_key] + if len(nearby_programs) >= remaining_slots: + break - # If we need more, add random programs + # If we still need more, add random programs from the island if len(inspirations) + len(nearby_programs) < n: remaining = n - len(inspirations) - len(nearby_programs) - all_ids = set(self.programs.keys()) + + # Get available programs from the island excluded_ids = ( {parent.id} .union(p.id for p in inspirations) .union(p.id for p in nearby_programs) ) - available_ids = list(all_ids - excluded_ids) - - if available_ids: - random_ids = random.sample(available_ids, min(remaining, len(available_ids))) + available_island_ids = [ + pid for pid in island_program_ids + if pid not in excluded_ids and pid in self.programs + ] + + if available_island_ids: + random_ids = random.sample( + available_island_ids, + min(remaining, len(available_island_ids)) + ) random_programs = [self.programs[pid] for pid in random_ids] nearby_programs.extend(random_programs) inspirations.extend(nearby_programs) + # Log island isolation info + logger.debug( + f"Sampled {len(inspirations)} inspirations from island {parent_island} " + f"(island has {len(island_programs)} programs total)" + ) + return inspirations[:n] def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> None: @@ -1103,6 +1211,9 @@ def migrate_programs(self) -> None: # Add to target island self.islands[target_island].add(migrant_copy.id) self.programs[migrant_copy.id] = migrant_copy + + # Update island-specific best program if migrant is better + self._update_island_best_program(migrant_copy, target_island) logger.debug( f"Migrated program {migrant.id} from island {i} to island {target_island}" @@ -1214,10 +1325,13 @@ def log_island_status(self) -> None: logger.info("Island Status:") for stat in stats: current_marker = " *" if stat["is_current"] else " " + island_idx = stat['island'] + island_best_id = self.island_best_programs[island_idx] if island_idx < len(self.island_best_programs) else None + best_indicator = f" (best: {island_best_id})" if island_best_id else "" logger.info( f"{current_marker} Island {stat['island']}: {stat['population_size']} programs, " f"best={stat['best_score']:.4f}, avg={stat['average_score']:.4f}, " - f"diversity={stat['diversity']:.2f}, gen={stat['generation']}" + f"diversity={stat['diversity']:.2f}, gen={stat['generation']}{best_indicator}" ) # Artifact storage and retrieval methods diff --git a/openevolve/iteration.py b/openevolve/iteration.py index 98db88f09..11d3453a8 100644 --- a/openevolve/iteration.py +++ b/openevolve/iteration.py @@ -53,16 +53,18 @@ async def run_iteration_with_shared_db( # Get artifacts for the parent program if available 
parent_artifacts = database.get_artifacts(parent.id) - # Get actual top programs for prompt context (separate from inspirations) - actual_top_programs = database.get_top_programs(5) + # Get island-specific top programs for prompt context (maintain island isolation) + parent_island = parent.metadata.get("island", database.current_island) + island_top_programs = database.get_top_programs(5, island_idx=parent_island) + island_previous_programs = database.get_top_programs(3, island_idx=parent_island) # Build prompt prompt = prompt_sampler.build_prompt( current_program=parent.code, parent_program=parent.code, program_metrics=parent.metrics, - previous_programs=[p.to_dict() for p in database.get_top_programs(3)], - top_programs=[p.to_dict() for p in actual_top_programs], + previous_programs=[p.to_dict() for p in island_previous_programs], + top_programs=[p.to_dict() for p in island_top_programs], inspirations=[p.to_dict() for p in inspirations], language=config.language, evolution_round=iteration, From 545557ae05ec63d2c043bc493452c5ec36ca819f Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Fri, 11 Jul 2025 10:53:21 +0800 Subject: [PATCH 2/4] Add cascade evaluation config validation and update YAML Added validation of cascade evaluation configuration in Evaluator to warn if cascade functions are missing or incomplete. Updated config.yaml to set cascade_evaluation to false, reflecting that evaluator does not implement cascade functions. Improved _direct_evaluate to support both dict and EvaluationResult returns. --- examples/rust_adaptive_sort/config.yaml | 8 ++--- openevolve/evaluator.py | 43 +++++++++++++++++++++---- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/examples/rust_adaptive_sort/config.yaml b/examples/rust_adaptive_sort/config.yaml index 0f5649d5f..497942891 100644 --- a/examples/rust_adaptive_sort/config.yaml +++ b/examples/rust_adaptive_sort/config.yaml @@ -49,9 +49,5 @@ evaluator: timeout: 60 # Rust compilation can take time parallel_evaluations: 3 - # Use cascade evaluation for performance testing - cascade_evaluation: true - cascade_thresholds: - - 0.5 # Compilation success and basic correctness - - 0.7 # Good performance - - 0.85 # Excellent adaptability \ No newline at end of file + # Direct evaluation - evaluator doesn't implement cascade functions + cascade_evaluation: false \ No newline at end of file diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index dfe966f50..42a3be93d 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -89,10 +89,42 @@ def _load_evaluation_function(self) -> None: self.evaluate_function = module.evaluate logger.info(f"Successfully loaded evaluation function from {self.evaluation_file}") + + # Validate cascade configuration + self._validate_cascade_configuration(module) except Exception as e: logger.error(f"Error loading evaluation function: {str(e)}") raise + def _validate_cascade_configuration(self, module) -> None: + """ + Validate cascade evaluation configuration and warn about potential issues + + Args: + module: The loaded evaluation module + """ + if self.config.cascade_evaluation: + # Check if cascade functions exist + has_stage1 = hasattr(module, "evaluate_stage1") + has_stage2 = hasattr(module, "evaluate_stage2") + has_stage3 = hasattr(module, "evaluate_stage3") + + if not has_stage1: + logger.warning( + f"Configuration has 'cascade_evaluation: true' but evaluator " + f"'{self.evaluation_file}' does not define 'evaluate_stage1' function. 
" + f"This will fall back to direct evaluation, making the cascade setting useless. " + f"Consider setting 'cascade_evaluation: false' or implementing cascade functions." + ) + elif not (has_stage2 or has_stage3): + logger.warning( + f"Evaluator '{self.evaluation_file}' defines 'evaluate_stage1' but no additional " + f"cascade stages (evaluate_stage2, evaluate_stage3). Consider implementing " + f"multi-stage evaluation for better cascade benefits." + ) + else: + logger.debug(f"Cascade evaluation properly configured with available stage functions") + async def evaluate_program( self, program_code: str, @@ -273,7 +305,7 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str """ return self._pending_artifacts.pop(program_id, None) - async def _direct_evaluate(self, program_path: str) -> Dict[str, float]: + async def _direct_evaluate(self, program_path: str) -> Union[Dict[str, float], EvaluationResult]: """ Directly evaluate a program using the evaluation function with timeout @@ -281,7 +313,7 @@ async def _direct_evaluate(self, program_path: str) -> Dict[str, float]: program_path: Path to the program file Returns: - Dictionary of metric name to score + Dictionary of metrics or EvaluationResult with metrics and artifacts Raises: asyncio.TimeoutError: If evaluation exceeds timeout @@ -296,11 +328,8 @@ async def run_evaluation(): # Run the evaluation with timeout - let exceptions bubble up for retry handling result = await asyncio.wait_for(run_evaluation(), timeout=self.config.timeout) - # Validate result - if not isinstance(result, dict): - logger.warning(f"Evaluation returned non-dictionary result: {result}") - return {"error": 0.0} - + # Return result as-is to be processed by _process_evaluation_result + # This supports both dict and EvaluationResult returns, just like _cascade_evaluate return result async def _cascade_evaluate( From 930eaa595a202475e6df29d04b846a48dbbc8f1a Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Fri, 11 Jul 2025 11:11:23 +0800 Subject: [PATCH 3/4] Add MAP-Elites logging, island migration validation, and tests Enhanced ProgramDatabase with detailed MAP-Elites cell logging, coverage milestones, and cell improvement events. Added validation and cleanup for island best program tracking and migration consistency. Improved Evaluator error context for cascade failures. Added comprehensive tests for cascade validation, island migration, and island best program tracking. 
--- openevolve/database.py | 126 ++++++++++++- openevolve/evaluator.py | 33 +++- tests/test_cascade_validation.py | 301 +++++++++++++++++++++++++++++++ tests/test_database.py | 188 +++++++++++++++++++ tests/test_island_migration.py | 252 ++++++++++++++++++++++++++ tests/test_island_tracking.py | 266 +++++++++++++++++++++++++++ 6 files changed, 1158 insertions(+), 8 deletions(-) create mode 100644 tests/test_cascade_validation.py create mode 100644 tests/test_island_migration.py create mode 100644 tests/test_island_tracking.py diff --git a/openevolve/database.py b/openevolve/database.py index 66d2c25af..253b66fd5 100644 --- a/openevolve/database.py +++ b/openevolve/database.py @@ -189,6 +189,28 @@ def add( should_replace = self._is_better(program, self.programs[existing_program_id]) if should_replace: + # Log significant MAP-Elites events + coords_dict = {self.config.feature_dimensions[i]: feature_coords[i] for i in range(len(feature_coords))} + + if feature_key not in self.feature_map: + # New cell occupation + logging.info("New MAP-Elites cell occupied: %s", coords_dict) + # Check coverage milestone + total_possible_cells = self.feature_bins ** len(self.config.feature_dimensions) + coverage = (len(self.feature_map) + 1) / total_possible_cells + if coverage in [0.1, 0.25, 0.5, 0.75, 0.9]: + logging.info("MAP-Elites coverage reached %.1f%% (%d/%d cells)", + coverage * 100, len(self.feature_map) + 1, total_possible_cells) + else: + # Cell replacement - existing program being replaced + existing_program_id = self.feature_map[feature_key] + if existing_program_id in self.programs: + existing_program = self.programs[existing_program_id] + new_fitness = safe_numeric_average(program.metrics) + existing_fitness = safe_numeric_average(existing_program.metrics) + logging.info("MAP-Elites cell improved: %s (fitness: %.3f -> %.3f)", + coords_dict, existing_fitness, new_fitness) + self.feature_map[feature_key] = program.id # Add to specific island (not random!) 
@@ -515,6 +537,9 @@ def _reconstruct_islands(self, saved_islands: List[List[str]]) -> None: feature_keys_to_remove.append(key) for key in feature_keys_to_remove: del self.feature_map[key] + + # Clean up island best programs - remove stale references + self._cleanup_stale_island_bests() # Check best program if self.best_program_id and self.best_program_id not in self.programs: @@ -641,7 +666,8 @@ def _calculate_feature_coords(self, program: Program) -> List[int]: else: # Default to middle bin if feature not found coords.append(self.feature_bins // 2) - logging.info( + # Only log coordinates at debug level for troubleshooting + logging.debug( "MAP-Elites coords: %s", str({self.config.feature_dimensions[i]: coords[i] for i in range(len(coords))}), ) @@ -1138,6 +1164,9 @@ def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> logger.debug(f"Removed program {program_id} due to population limit") logger.info(f"Population size after cleanup: {len(self.programs)}") + + # Clean up any stale island best program references after removal + self._cleanup_stale_island_bests() # Island management methods def set_current_island(self, island_idx: int) -> None: @@ -1215,13 +1244,102 @@ def migrate_programs(self) -> None: # Update island-specific best program if migrant is better self._update_island_best_program(migrant_copy, target_island) - logger.debug( - f"Migrated program {migrant.id} from island {i} to island {target_island}" - ) + # Log migration with MAP-Elites coordinates + feature_coords = self._calculate_feature_coords(migrant_copy) + coords_dict = {self.config.feature_dimensions[j]: feature_coords[j] for j in range(len(feature_coords))} + logger.info("Program migrated to island %d at MAP-Elites coords: %s", + target_island, coords_dict) # Update last migration generation self.last_migration_generation = max(self.island_generations) logger.info(f"Migration completed at generation {self.last_migration_generation}") + + # Validate migration results + self._validate_migration_results() + + def _validate_migration_results(self) -> None: + """ + Validate migration didn't create inconsistencies + + Checks that: + 1. Program island metadata matches actual island assignment + 2. No programs are assigned to multiple islands + 3. 
All island best programs exist and are in correct islands + """ + seen_program_ids = set() + + for i, island in enumerate(self.islands): + for program_id in island: + # Check for duplicate assignments + if program_id in seen_program_ids: + logger.error(f"Program {program_id} assigned to multiple islands") + continue + seen_program_ids.add(program_id) + + # Check program exists + if program_id not in self.programs: + logger.warning(f"Island {i} contains nonexistent program {program_id}") + continue + + # Check metadata consistency + program = self.programs[program_id] + stored_island = program.metadata.get("island") + if stored_island != i: + logger.warning( + f"Island mismatch for program {program_id}: " + f"in island {i} but metadata says {stored_island}" + ) + + # Validate island best programs + for i, best_id in enumerate(self.island_best_programs): + if best_id is not None: + if best_id not in self.programs: + logger.warning(f"Island {i} best program {best_id} does not exist") + elif best_id not in self.islands[i]: + logger.warning(f"Island {i} best program {best_id} not in island") + + def _cleanup_stale_island_bests(self) -> None: + """ + Remove stale island best program references + + Cleans up references to programs that no longer exist in the database + or are not actually in their assigned islands. + """ + cleaned_count = 0 + + for i, best_id in enumerate(self.island_best_programs): + if best_id is not None: + should_clear = False + + # Check if program still exists + if best_id not in self.programs: + logger.debug(f"Clearing stale island {i} best program {best_id} (program deleted)") + should_clear = True + # Check if program is still in the island + elif best_id not in self.islands[i]: + logger.debug(f"Clearing stale island {i} best program {best_id} (not in island)") + should_clear = True + + if should_clear: + self.island_best_programs[i] = None + cleaned_count += 1 + + if cleaned_count > 0: + logger.info(f"Cleaned up {cleaned_count} stale island best program references") + + # Recalculate best programs for islands that were cleared + for i, best_id in enumerate(self.island_best_programs): + if best_id is None and len(self.islands[i]) > 0: + # Find new best program for this island + island_programs = [self.programs[pid] for pid in self.islands[i] if pid in self.programs] + if island_programs: + # Sort by fitness and update + best_program = max( + island_programs, + key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)) + ) + self.island_best_programs[i] = best_program.id + logger.debug(f"Recalculated island {i} best program: {best_program.id}") def get_island_stats(self) -> List[dict]: """Get statistics for each island""" diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index 42a3be93d..2ab93f361 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -383,13 +383,14 @@ async def run_stage1(): ) except Exception as e: logger.error(f"Error in stage 1 evaluation: {str(e)}") - # Capture stage 1 failure as artifacts + # Capture stage 1 failure with enhanced context + error_context = self._create_cascade_error_context("stage1", e) return EvaluationResult( metrics={"stage1_passed": 0.0, "error": 0.0}, artifacts={ "stderr": str(e), "traceback": traceback.format_exc(), - "failure_stage": "stage1", + **error_context, }, ) @@ -510,13 +511,14 @@ async def run_stage3(): except Exception as e: logger.error(f"Error in cascade evaluation: {str(e)}") - # Return proper cascade failure result instead of re-raising + # Return proper cascade 
failure result with enhanced context + error_context = self._create_cascade_error_context("cascade_setup", e) return EvaluationResult( metrics={"stage1_passed": 0.0, "error": 0.0}, artifacts={ "stderr": str(e), "traceback": traceback.format_exc(), - "failure_stage": "cascade_setup", + **error_context, }, ) @@ -611,6 +613,29 @@ async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[s traceback.print_exc() return {} + def _create_cascade_error_context(self, stage: str, error: Exception) -> dict: + """ + Create rich error context for cascade failures + + Args: + stage: The stage where the error occurred + error: The exception that was raised + + Returns: + Dictionary with enhanced error context + """ + import time + return { + "failure_stage": stage, + "error_type": type(error).__name__, + "error_message": str(error), + "timestamp": time.time(), + "cascade_config": self.config.cascade_evaluation, + "cascade_thresholds": getattr(self.config, 'cascade_thresholds', []), + "timeout_config": self.config.timeout, + "evaluation_file": self.evaluation_file, + } + def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool: """ Check if metrics pass a threshold diff --git a/tests/test_cascade_validation.py b/tests/test_cascade_validation.py new file mode 100644 index 000000000..0464b4278 --- /dev/null +++ b/tests/test_cascade_validation.py @@ -0,0 +1,301 @@ +""" +Tests for cascade evaluation validation functionality in openevolve.evaluator +""" + +import unittest +import tempfile +import os +from unittest.mock import patch, MagicMock +from openevolve.config import Config +from openevolve.evaluator import Evaluator +from openevolve.database import EvaluationResult + + +class TestCascadeValidation(unittest.TestCase): + """Tests for cascade evaluation configuration validation""" + + def setUp(self): + """Set up test evaluator with cascade validation""" + self.config = Config() + + # Create temporary evaluator files for testing + self.temp_dir = tempfile.mkdtemp() + + def tearDown(self): + """Clean up temporary files""" + # Clean up temp files + for file in os.listdir(self.temp_dir): + os.remove(os.path.join(self.temp_dir, file)) + os.rmdir(self.temp_dir) + + def _create_evaluator_file(self, filename: str, content: str) -> str: + """Helper to create temporary evaluator file""" + file_path = os.path.join(self.temp_dir, filename) + with open(file_path, 'w') as f: + f.write(content) + return file_path + + def test_cascade_validation_with_valid_evaluator(self): + """Test cascade validation with evaluator that has cascade functions""" + # Create evaluator with cascade functions + evaluator_content = ''' +def evaluate_stage1(program_path): + return {"stage1_score": 0.5} + +def evaluate_stage2(program_path): + return {"stage2_score": 0.7} + +def evaluate_stage3(program_path): + return {"stage3_score": 0.9} + +def evaluate(program_path): + return {"final_score": 1.0} +''' + evaluator_path = self._create_evaluator_file("valid_cascade.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should not raise warnings for valid cascade evaluator + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should not have called warning + mock_logger.warning.assert_not_called() + + def test_cascade_validation_warning_for_missing_functions(self): + """Test cascade validation warns when cascade 
functions are missing""" + # Create evaluator without cascade functions + evaluator_content = ''' +def evaluate(program_path): + return {"score": 0.5} +''' + evaluator_path = self._create_evaluator_file("no_cascade.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should warn about missing cascade functions + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should have warned about missing stage functions + mock_logger.warning.assert_called() + warning_call = mock_logger.warning.call_args[0][0] + self.assertIn("cascade_evaluation: true", warning_call) + self.assertIn("evaluate_stage1", warning_call) + + def test_cascade_validation_partial_functions(self): + """Test cascade validation with only some cascade functions""" + # Create evaluator with only stage1 + evaluator_content = ''' +def evaluate_stage1(program_path): + return {"stage1_score": 0.5} + +def evaluate(program_path): + return {"score": 0.5} +''' + evaluator_path = self._create_evaluator_file("partial_cascade.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should not warn since stage1 exists (minimum requirement) + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should not warn since stage1 exists + mock_logger.warning.assert_not_called() + + def test_no_cascade_validation_when_disabled(self): + """Test no validation when cascade evaluation is disabled""" + # Create evaluator without cascade functions + evaluator_content = ''' +def evaluate(program_path): + return {"score": 0.5} +''' + evaluator_path = self._create_evaluator_file("no_cascade.py", evaluator_content) + + # Configure WITHOUT cascade evaluation + self.config.evaluator.cascade_evaluation = False + self.config.evaluator.evaluation_file = evaluator_path + + # Should not perform validation or warn + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should not warn when cascade evaluation is disabled + mock_logger.warning.assert_not_called() + + def test_direct_evaluate_supports_evaluation_result(self): + """Test that _direct_evaluate supports EvaluationResult returns""" + # Create evaluator that returns EvaluationResult + evaluator_content = ''' +from openevolve.database import EvaluationResult + +def evaluate(program_path): + return EvaluationResult( + metrics={"score": 0.8, "accuracy": 0.9}, + artifacts={"debug_info": "test data"} + ) +''' + evaluator_path = self._create_evaluator_file("result_evaluator.py", evaluator_content) + + self.config.evaluator.cascade_evaluation = False + self.config.evaluator.evaluation_file = evaluator_path + self.config.evaluator.timeout = 10 + + evaluator = Evaluator(self.config.evaluator, None) + + # Create a dummy program file + program_path = self._create_evaluator_file("test_program.py", "def test(): pass") + + # Mock the evaluation process + with patch('openevolve.evaluator.run_external_evaluator') as mock_run: + mock_run.return_value = EvaluationResult( + metrics={"score": 0.8, "accuracy": 0.9}, + artifacts={"debug_info": "test data"} + ) + + # Should handle EvaluationResult without issues + result = evaluator._direct_evaluate(program_path) + + # Should return the EvaluationResult 
as-is + self.assertIsInstance(result, EvaluationResult) + self.assertEqual(result.metrics["score"], 0.8) + self.assertEqual(result.artifacts["debug_info"], "test data") + + def test_direct_evaluate_supports_dict_result(self): + """Test that _direct_evaluate still supports dict returns""" + # Create evaluator that returns dict + evaluator_content = ''' +def evaluate(program_path): + return {"score": 0.7, "performance": 0.85} +''' + evaluator_path = self._create_evaluator_file("dict_evaluator.py", evaluator_content) + + self.config.evaluator.cascade_evaluation = False + self.config.evaluator.evaluation_file = evaluator_path + self.config.evaluator.timeout = 10 + + evaluator = Evaluator(self.config.evaluator, None) + + # Create a dummy program file + program_path = self._create_evaluator_file("test_program.py", "def test(): pass") + + # Mock the evaluation process + with patch('openevolve.evaluator.run_external_evaluator') as mock_run: + mock_run.return_value = {"score": 0.7, "performance": 0.85} + + # Should handle dict result without issues + result = evaluator._direct_evaluate(program_path) + + # Should return the dict as-is + self.assertIsInstance(result, dict) + self.assertEqual(result["score"], 0.7) + self.assertEqual(result["performance"], 0.85) + + def test_cascade_validation_with_class_based_evaluator(self): + """Test cascade validation with class-based evaluator""" + # Create class-based evaluator + evaluator_content = ''' +class Evaluator: + def evaluate_stage1(self, program_path): + return {"stage1_score": 0.5} + + def evaluate(self, program_path): + return {"score": 0.5} + +# Module-level functions (what validation looks for) +def evaluate_stage1(program_path): + evaluator = Evaluator() + return evaluator.evaluate_stage1(program_path) + +def evaluate(program_path): + evaluator = Evaluator() + return evaluator.evaluate(program_path) +''' + evaluator_path = self._create_evaluator_file("class_cascade.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should not warn since module-level functions exist + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + mock_logger.warning.assert_not_called() + + def test_cascade_validation_with_syntax_error(self): + """Test cascade validation handles syntax errors gracefully""" + # Create evaluator with syntax error + evaluator_content = ''' +def evaluate_stage1(program_path) # Missing colon + return {"stage1_score": 0.5} +''' + evaluator_path = self._create_evaluator_file("syntax_error.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should handle syntax error and still warn about cascade + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should have warned about missing functions (due to import failure) + mock_logger.warning.assert_called() + + def test_cascade_validation_nonexistent_file(self): + """Test cascade validation with nonexistent evaluator file""" + # Configure with nonexistent file + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = "/nonexistent/path.py" + + # Should handle missing file gracefully + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # 
Should have warned about missing functions (due to import failure) + mock_logger.warning.assert_called() + + def test_process_evaluation_result_with_artifacts(self): + """Test that _process_evaluation_result handles artifacts correctly""" + evaluator_path = self._create_evaluator_file("dummy.py", "def evaluate(p): pass") + + self.config.evaluator.evaluation_file = evaluator_path + evaluator = Evaluator(self.config.evaluator, None) + + # Test with EvaluationResult containing artifacts + eval_result = EvaluationResult( + metrics={"score": 0.9}, + artifacts={"log": "test log", "data": [1, 2, 3]} + ) + + metrics, artifacts = evaluator._process_evaluation_result(eval_result) + + self.assertEqual(metrics, {"score": 0.9}) + self.assertEqual(artifacts, {"log": "test log", "data": [1, 2, 3]}) + + def test_process_evaluation_result_with_dict(self): + """Test that _process_evaluation_result handles dict results correctly""" + evaluator_path = self._create_evaluator_file("dummy.py", "def evaluate(p): pass") + + self.config.evaluator.evaluation_file = evaluator_path + evaluator = Evaluator(self.config.evaluator, None) + + # Test with dict result + dict_result = {"score": 0.7, "accuracy": 0.8} + + metrics, artifacts = evaluator._process_evaluation_result(dict_result) + + self.assertEqual(metrics, {"score": 0.7, "accuracy": 0.8}) + self.assertEqual(artifacts, {}) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/test_database.py b/tests/test_database.py index bfa35040c..883538eb3 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -80,6 +80,194 @@ def test_sample(self): self.assertIsNotNone(parent) self.assertIn(parent.id, ["test1", "test2"]) + def test_island_operations_basic(self): + """Test basic island operations""" + # Test with default islands (should be 5 by default) + self.assertEqual(len(self.db.islands), 5) + + program = Program( + id="island_test", + code="def island_test(): pass", + language="python", + metrics={"score": 0.6}, + ) + + self.db.add(program) + + # Should be in island 0 + self.assertIn("island_test", self.db.islands[0]) + self.assertEqual(program.metadata.get("island"), 0) + + def test_multi_island_setup(self): + """Test database with multiple islands""" + # Create new database with multiple islands + config = Config() + config.database.in_memory = True + config.database.num_islands = 3 + multi_db = ProgramDatabase(config.database) + + self.assertEqual(len(multi_db.islands), 3) + self.assertEqual(len(multi_db.island_best_programs), 3) + + # Add programs to specific islands + for i in range(3): + program = Program( + id=f"test_island_{i}", + code=f"def test_{i}(): pass", + language="python", + metrics={"score": 0.5 + i * 0.1}, + ) + multi_db.add(program, target_island=i) + + # Verify assignment + self.assertIn(f"test_island_{i}", multi_db.islands[i]) + self.assertEqual(program.metadata.get("island"), i) + + def test_feature_coordinates_calculation(self): + """Test MAP-Elites feature coordinate calculation""" + program = Program( + id="feature_test", + code="def test(): pass", # Short code + language="python", + metrics={"score": 0.8}, + ) + + coords = self.db._calculate_feature_coords(program) + + # Should return list of coordinates + self.assertIsInstance(coords, list) + self.assertEqual(len(coords), len(self.db.config.feature_dimensions)) + + # All coordinates should be within valid range + for coord in coords: + self.assertGreaterEqual(coord, 0) + self.assertLess(coord, self.db.feature_bins) + + def 
test_feature_map_operations(self): + """Test feature map operations for MAP-Elites""" + program1 = Program( + id="map_test1", + code="def short(): pass", # Similar complexity + language="python", + metrics={"score": 0.5}, + ) + + program2 = Program( + id="map_test2", + code="def also_short(): pass", # Similar complexity + language="python", + metrics={"score": 0.8}, # Better score + ) + + self.db.add(program1) + self.db.add(program2) + + # Both programs might land in same cell due to similar features + # The better program should be kept in the feature map + feature_coords1 = self.db._calculate_feature_coords(program1) + feature_coords2 = self.db._calculate_feature_coords(program2) + + key1 = self.db._feature_coords_to_key(feature_coords1) + key2 = self.db._feature_coords_to_key(feature_coords2) + + if key1 == key2: # Same cell + # Better program should be in feature map + self.assertEqual(self.db.feature_map[key1], "map_test2") + else: # Different cells + # Both should be in feature map + self.assertEqual(self.db.feature_map[key1], "map_test1") + self.assertEqual(self.db.feature_map[key2], "map_test2") + + def test_get_top_programs_with_metrics(self): + """Test get_top_programs with specific metrics""" + program1 = Program( + id="metric_test1", + code="def test1(): pass", + language="python", + metrics={"accuracy": 0.9, "speed": 0.3}, + ) + + program2 = Program( + id="metric_test2", + code="def test2(): pass", + language="python", + metrics={"accuracy": 0.7, "speed": 0.8}, + ) + + self.db.add(program1) + self.db.add(program2) + + # Test sorting by specific metric + top_by_accuracy = self.db.get_top_programs(n=2, metric="accuracy") + self.assertEqual(top_by_accuracy[0].id, "metric_test1") # Higher accuracy + + top_by_speed = self.db.get_top_programs(n=2, metric="speed") + self.assertEqual(top_by_speed[0].id, "metric_test2") # Higher speed + + def test_archive_operations(self): + """Test archive functionality""" + # Add programs that should go into archive + for i in range(5): + program = Program( + id=f"archive_test_{i}", + code=f"def test_{i}(): return {i}", + language="python", + metrics={"score": i * 0.1}, + ) + self.db.add(program) + + # Archive should contain program IDs + self.assertGreater(len(self.db.archive), 0) + self.assertLessEqual(len(self.db.archive), self.db.config.archive_size) + + # Archive should contain program IDs that exist + for program_id in self.db.archive: + self.assertIn(program_id, self.db.programs) + + def test_best_program_tracking(self): + """Test absolute best program tracking""" + program1 = Program( + id="best_test1", + code="def test1(): pass", + language="python", + metrics={"combined_score": 0.6}, + ) + + program2 = Program( + id="best_test2", + code="def test2(): pass", + language="python", + metrics={"combined_score": 0.9}, + ) + + self.db.add(program1) + self.assertEqual(self.db.best_program_id, "best_test1") + + self.db.add(program2) + self.assertEqual(self.db.best_program_id, "best_test2") # Should update to better program + + def test_population_limit_enforcement(self): + """Test population size limit enforcement""" + # Set small population limit + original_limit = self.db.config.population_size + self.db.config.population_size = 3 + + # Add more programs than limit + for i in range(5): + program = Program( + id=f"limit_test_{i}", + code=f"def test_{i}(): pass", + language="python", + metrics={"score": i * 0.1}, + ) + self.db.add(program) + + # Population should be at or below limit + self.assertLessEqual(len(self.db.programs), 3) + + # Restore 
original limit + self.db.config.population_size = original_limit + if __name__ == "__main__": unittest.main() diff --git a/tests/test_island_migration.py b/tests/test_island_migration.py new file mode 100644 index 000000000..efde4e37b --- /dev/null +++ b/tests/test_island_migration.py @@ -0,0 +1,252 @@ +""" +Tests for island migration functionality in openevolve.database +""" + +import unittest +from openevolve.config import Config +from openevolve.database import Program, ProgramDatabase + + +class TestIslandMigration(unittest.TestCase): + """Tests for island migration in program database""" + + def setUp(self): + """Set up test database with multiple islands""" + config = Config() + config.database.in_memory = True + config.database.num_islands = 3 + config.database.migration_rate = 0.5 # 50% of programs migrate + config.database.migration_generations = 5 # Migrate every 5 generations + self.db = ProgramDatabase(config.database) + + def _create_test_program(self, program_id: str, score: float, island: int) -> Program: + """Helper to create a test program""" + program = Program( + id=program_id, + code=f"def func_{program_id}(): return {score}", + language="python", + metrics={"score": score, "combined_score": score}, + metadata={"island": island} + ) + return program + + def test_initial_island_setup(self): + """Test that islands are properly initialized""" + self.assertEqual(len(self.db.islands), 3) + self.assertEqual(len(self.db.island_best_programs), 3) + self.assertEqual(len(self.db.island_generations), 3) + + # All islands should be empty initially + for island in self.db.islands: + self.assertEqual(len(island), 0) + + # All island best programs should be None initially + for best_id in self.db.island_best_programs: + self.assertIsNone(best_id) + + def test_program_island_assignment(self): + """Test that programs are assigned to correct islands""" + # Add programs to specific islands + program1 = self._create_test_program("test1", 0.5, 0) + program2 = self._create_test_program("test2", 0.7, 1) + program3 = self._create_test_program("test3", 0.3, 2) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + self.db.add(program3, target_island=2) + + # Verify island assignments + self.assertIn("test1", self.db.islands[0]) + self.assertIn("test2", self.db.islands[1]) + self.assertIn("test3", self.db.islands[2]) + + # Verify metadata + self.assertEqual(self.db.programs["test1"].metadata["island"], 0) + self.assertEqual(self.db.programs["test2"].metadata["island"], 1) + self.assertEqual(self.db.programs["test3"].metadata["island"], 2) + + def test_should_migrate_logic(self): + """Test the migration timing logic""" + # Initially should not migrate (no generations passed) + self.assertFalse(self.db.should_migrate()) + + # Advance island generations + self.db.island_generations = [5, 6, 7] # All above threshold + self.assertTrue(self.db.should_migrate()) + + # Test with mixed generations + self.db.island_generations = [3, 6, 2] # Only one above threshold + self.assertFalse(self.db.should_migrate()) + + def test_migration_ring_topology(self): + """Test that migration follows ring topology""" + # Add programs to islands 0 and 1 + program1 = self._create_test_program("test1", 0.8, 0) + program2 = self._create_test_program("test2", 0.6, 1) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + + # Set up for migration + self.db.island_generations = [6, 6, 6] # Trigger migration + + initial_program_count = len(self.db.programs) + + # 
Perform migration + self.db.migrate_programs() + + # Should have created migrant copies + self.assertGreater(len(self.db.programs), initial_program_count) + + # Check that migrants were created with proper naming + migrant_ids = [pid for pid in self.db.programs.keys() if "_migrant_" in pid] + self.assertGreater(len(migrant_ids), 0) + + # Verify ring topology: island 0 -> islands 1,2; island 1 -> islands 2,0 + island_0_migrants = [pid for pid in migrant_ids if "test1_migrant_" in pid] + island_1_migrants = [pid for pid in migrant_ids if "test2_migrant_" in pid] + + # test1 should migrate to islands 1 and 2 + self.assertTrue(any("_1" in pid for pid in island_0_migrants)) + self.assertTrue(any("_2" in pid for pid in island_0_migrants)) + + # test2 should migrate to islands 2 and 0 + self.assertTrue(any("_2" in pid for pid in island_1_migrants)) + self.assertTrue(any("_0" in pid for pid in island_1_migrants)) + + def test_migration_rate_respected(self): + """Test that migration rate is properly applied""" + # Add multiple programs to island 0 + programs = [] + for i in range(10): + program = self._create_test_program(f"test{i}", 0.5 + i * 0.05, 0) + programs.append(program) + self.db.add(program, target_island=0) + + # Set up for migration + self.db.island_generations = [6, 6, 6] + + initial_count = len(self.db.programs) + + # Perform migration + self.db.migrate_programs() + + # Calculate expected migrants + # With 50% migration rate and 10 programs, expect 5 migrants + # Each migrant goes to 2 target islands, so 10 total new programs + expected_new_programs = 5 * 2 # 5 migrants * 2 target islands each + actual_new_programs = len(self.db.programs) - initial_count + + self.assertEqual(actual_new_programs, expected_new_programs) + + def test_migration_preserves_best_programs(self): + """Test that migration selects the best programs for migration""" + # Add programs with different scores to island 0 + program1 = self._create_test_program("low_score", 0.2, 0) + program2 = self._create_test_program("high_score", 0.9, 0) + program3 = self._create_test_program("med_score", 0.5, 0) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=0) + self.db.add(program3, target_island=0) + + # Set up for migration + self.db.island_generations = [6, 6, 6] + + # Perform migration + self.db.migrate_programs() + + # Check that the high-score program was selected for migration + migrant_ids = [pid for pid in self.db.programs.keys() if "_migrant_" in pid] + high_score_migrants = [pid for pid in migrant_ids if "high_score_migrant_" in pid] + + self.assertGreater(len(high_score_migrants), 0) + + def test_migration_updates_generations(self): + """Test that migration updates the last migration generation""" + # Add a program and set up for migration + program = self._create_test_program("test1", 0.5, 0) + self.db.add(program, target_island=0) + + self.db.island_generations = [6, 7, 8] + initial_migration_gen = self.db.last_migration_generation + + # Perform migration + self.db.migrate_programs() + + # Should update to max of island generations + self.assertEqual(self.db.last_migration_generation, 8) + self.assertGreater(self.db.last_migration_generation, initial_migration_gen) + + def test_migration_with_empty_islands(self): + """Test that migration handles empty islands gracefully""" + # Add program only to island 0, leave others empty + program = self._create_test_program("test1", 0.5, 0) + self.db.add(program, target_island=0) + + # Set up for migration + self.db.island_generations = [6, 6, 
6] + + # Should not crash with empty islands + try: + self.db.migrate_programs() + except Exception as e: + self.fail(f"Migration with empty islands should not crash: {e}") + + def test_migration_creates_proper_copies(self): + """Test that migration creates proper program copies""" + program = self._create_test_program("original", 0.7, 0) + self.db.add(program, target_island=0) + + # Set up for migration + self.db.island_generations = [6, 6, 6] + + # Perform migration + self.db.migrate_programs() + + # Find migrant copies + migrant_ids = [pid for pid in self.db.programs.keys() if "original_migrant_" in pid] + self.assertGreater(len(migrant_ids), 0) + + # Check migrant properties + for migrant_id in migrant_ids: + migrant = self.db.programs[migrant_id] + + # Should have same code and metrics + self.assertEqual(migrant.code, program.code) + self.assertEqual(migrant.metrics, program.metrics) + + # Should have proper parent reference + self.assertEqual(migrant.parent_id, "original") + + # Should be marked as migrant + self.assertTrue(migrant.metadata.get("migrant", False)) + + # Should be in correct target island + target_island = migrant.metadata["island"] + self.assertIn(migrant_id, self.db.islands[target_island]) + + def test_no_migration_with_single_island(self): + """Test that migration is skipped with single island""" + # Create database with single island + config = Config() + config.database.in_memory = True + config.database.num_islands = 1 + single_island_db = ProgramDatabase(config.database) + + program = self._create_test_program("test1", 0.5, 0) + single_island_db.add(program, target_island=0) + + single_island_db.island_generations = [6] + + initial_count = len(single_island_db.programs) + + # Should not perform migration + single_island_db.migrate_programs() + + # Program count should remain the same + self.assertEqual(len(single_island_db.programs), initial_count) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/test_island_tracking.py b/tests/test_island_tracking.py new file mode 100644 index 000000000..28723da1f --- /dev/null +++ b/tests/test_island_tracking.py @@ -0,0 +1,266 @@ +""" +Tests for island best program tracking functionality in openevolve.database +""" + +import unittest +from openevolve.config import Config +from openevolve.database import Program, ProgramDatabase + + +class TestIslandTracking(unittest.TestCase): + """Tests for island best program tracking in program database""" + + def setUp(self): + """Set up test database with multiple islands""" + config = Config() + config.database.in_memory = True + config.database.num_islands = 3 + self.db = ProgramDatabase(config.database) + + def _create_test_program(self, program_id: str, score: float, island: int) -> Program: + """Helper to create a test program""" + program = Program( + id=program_id, + code=f"def func_{program_id}(): return {score}", + language="python", + metrics={"score": score, "combined_score": score}, + metadata={"island": island} + ) + return program + + def test_initial_island_best_tracking(self): + """Test initial state of island best program tracking""" + # Initially all island best programs should be None + self.assertEqual(len(self.db.island_best_programs), 3) + for best_id in self.db.island_best_programs: + self.assertIsNone(best_id) + + def test_first_program_becomes_island_best(self): + """Test that the first program added to an island becomes the best""" + program = self._create_test_program("first", 0.5, 0) + self.db.add(program, 
target_island=0) + + # Should become the best program for island 0 + self.assertEqual(self.db.island_best_programs[0], "first") + + # Other islands should still have None + self.assertIsNone(self.db.island_best_programs[1]) + self.assertIsNone(self.db.island_best_programs[2]) + + def test_better_program_updates_island_best(self): + """Test that a better program replaces the island best""" + # Add initial program + program1 = self._create_test_program("mediocre", 0.5, 0) + self.db.add(program1, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "mediocre") + + # Add better program + program2 = self._create_test_program("better", 0.8, 0) + self.db.add(program2, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "better") + + def test_worse_program_does_not_update_island_best(self): + """Test that a worse program does not replace the island best""" + # Add good program + program1 = self._create_test_program("good", 0.8, 0) + self.db.add(program1, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "good") + + # Add worse program + program2 = self._create_test_program("worse", 0.3, 0) + self.db.add(program2, target_island=0) + + # Should still be the good program + self.assertEqual(self.db.island_best_programs[0], "good") + + def test_island_isolation_in_best_tracking(self): + """Test that island best tracking is isolated between islands""" + # Add programs to different islands + program1 = self._create_test_program("island0_best", 0.9, 0) + program2 = self._create_test_program("island1_best", 0.7, 1) + program3 = self._create_test_program("island2_best", 0.5, 2) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + self.db.add(program3, target_island=2) + + # Each island should track its own best + self.assertEqual(self.db.island_best_programs[0], "island0_best") + self.assertEqual(self.db.island_best_programs[1], "island1_best") + self.assertEqual(self.db.island_best_programs[2], "island2_best") + + def test_migration_updates_island_best(self): + """Test that migration can update island best programs""" + # Add program to island 0 + original = self._create_test_program("original", 0.6, 0) + self.db.add(original, target_island=0) + + # Island 1 starts empty + self.assertIsNone(self.db.island_best_programs[1]) + + # Manually create a migrant to island 1 (simulating migration) + migrant = Program( + id="original_migrant_1", + code=original.code, + language=original.language, + parent_id=original.id, + generation=original.generation, + metrics=original.metrics.copy(), + metadata={"island": 1, "migrant": True} + ) + + # Add migrant to island 1 + self.db.add(migrant, target_island=1) + + # Should become best for island 1 + self.assertEqual(self.db.island_best_programs[1], "original_migrant_1") + + def test_get_top_programs_island_specific(self): + """Test getting top programs from a specific island""" + # Add programs to island 0 + program1 = self._create_test_program("prog1", 0.9, 0) + program2 = self._create_test_program("prog2", 0.7, 0) + program3 = self._create_test_program("prog3", 0.5, 0) + + # Add programs to island 1 + program4 = self._create_test_program("prog4", 0.8, 1) + program5 = self._create_test_program("prog5", 0.6, 1) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=0) + self.db.add(program3, target_island=0) + self.db.add(program4, target_island=1) + self.db.add(program5, target_island=1) + + # Get top programs from island 0 + island0_top = 
+        self.assertEqual(len(island0_top), 2)
+        self.assertEqual(island0_top[0].id, "prog1")  # Highest score
+        self.assertEqual(island0_top[1].id, "prog2")  # Second highest
+
+        # Get top programs from island 1
+        island1_top = self.db.get_top_programs(n=2, island_idx=1)
+        self.assertEqual(len(island1_top), 2)
+        self.assertEqual(island1_top[0].id, "prog4")  # Highest score in island 1
+        self.assertEqual(island1_top[1].id, "prog5")  # Second highest in island 1
+
+    def test_island_best_with_combined_score(self):
+        """Test island best tracking with combined_score metric"""
+        # Add programs with combined_score
+        program1 = Program(
+            id="test1",
+            code="def test1(): pass",
+            language="python",
+            metrics={"score": 0.5, "other": 0.3, "combined_score": 0.4},
+            metadata={"island": 0}
+        )
+
+        program2 = Program(
+            id="test2",
+            code="def test2(): pass",
+            language="python",
+            metrics={"score": 0.3, "other": 0.7, "combined_score": 0.5},
+            metadata={"island": 0}
+        )
+
+        self.db.add(program1, target_island=0)
+        self.assertEqual(self.db.island_best_programs[0], "test1")
+
+        # program2 has higher combined_score, should become best
+        self.db.add(program2, target_island=0)
+        self.assertEqual(self.db.island_best_programs[0], "test2")
+
+    def test_island_best_with_missing_program(self):
+        """Test island best tracking when best program is removed"""
+        program = self._create_test_program("to_remove", 0.8, 0)
+        self.db.add(program, target_island=0)
+        self.assertEqual(self.db.island_best_programs[0], "to_remove")
+
+        # Manually remove the program (simulating cleanup)
+        del self.db.programs["to_remove"]
+        self.db.islands[0].remove("to_remove")
+
+        # Add a new program - should detect stale reference and update
+        new_program = self._create_test_program("new", 0.6, 0)
+        self.db.add(new_program, target_island=0)
+
+        # Should update the best program (the old one is gone)
+        self.assertEqual(self.db.island_best_programs[0], "new")
+
+    def test_sample_inspirations_from_island(self):
+        """Test that inspiration sampling respects island boundaries"""
+        # Add programs to island 0
+        program1 = self._create_test_program("island0_prog1", 0.9, 0)
+        program2 = self._create_test_program("island0_prog2", 0.7, 0)
+
+        # Add programs to island 1
+        program3 = self._create_test_program("island1_prog1", 0.8, 1)
+        program4 = self._create_test_program("island1_prog2", 0.6, 1)
+
+        self.db.add(program1, target_island=0)
+        self.db.add(program2, target_island=0)
+        self.db.add(program3, target_island=1)
+        self.db.add(program4, target_island=1)
+
+        # Sample from island 0 program
+        inspirations = self.db._sample_inspirations(program1, n=5)
+
+        # All inspirations should be from island 0
+        for inspiration in inspirations:
+            island = inspiration.metadata.get("island")
+            self.assertEqual(island, 0, f"Program {inspiration.id} should be from island 0, got {island}")
+
+    def test_island_status_logging(self):
+        """Test island status logging functionality"""
+        # Add programs to different islands
+        program1 = self._create_test_program("p1", 0.9, 0)
+        program2 = self._create_test_program("p2", 0.7, 1)
+
+        self.db.add(program1, target_island=0)
+        self.db.add(program2, target_island=1)
+
+        # Should not crash when logging status
+        try:
+            self.db.log_island_status()
+        except Exception as e:
+            self.fail(f"Island status logging should not crash: {e}")
+
+    def test_island_best_persistence(self):
+        """Test that island best programs are maintained across operations"""
+        # Add programs to islands
+        program1 = self._create_test_program("best0", 0.9, 0)
self._create_test_program("best0", 0.9, 0) + program2 = self._create_test_program("best1", 0.8, 1) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + + # Verify initial state + self.assertEqual(self.db.island_best_programs[0], "best0") + self.assertEqual(self.db.island_best_programs[1], "best1") + + # Add more programs that are not better + program3 = self._create_test_program("worse0", 0.5, 0) + program4 = self._create_test_program("worse1", 0.4, 1) + + self.db.add(program3, target_island=0) + self.db.add(program4, target_island=1) + + # Best should remain unchanged + self.assertEqual(self.db.island_best_programs[0], "best0") + self.assertEqual(self.db.island_best_programs[1], "best1") + + def test_invalid_island_index_handling(self): + """Test handling of invalid island indices""" + # Test with island index out of bounds + with self.assertRaises(IndexError): + self.db.get_top_programs(n=5, island_idx=10) + + def test_empty_island_top_programs(self): + """Test getting top programs from empty island""" + # Island 0 is empty initially + top_programs = self.db.get_top_programs(n=5, island_idx=0) + self.assertEqual(len(top_programs), 0) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From a4a38473cacde079fb5c9a68596e78d202147479 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Fri, 11 Jul 2025 11:14:36 +0800 Subject: [PATCH 4/4] Bump version to 0.0.15 Updated version number in pyproject.toml and setup.py to prepare for the next release. --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index abe90c44f..cc41df178 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "openevolve" -version = "0.0.14" +version = "0.0.15" description = "Open-source implementation of AlphaEvolve" readme = "README.md" requires-python = ">=3.9" diff --git a/setup.py b/setup.py index e876b1c90..4db6920e8 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="openevolve", - version="0.0.14", + version="0.0.15", packages=find_packages(), include_package_data=True, )