def calculate_edit_distance(code1: str, code2: str) -> int:
    """
    Estimate the edit distance between two code snippets.

    The estimate is derived from ``difflib.SequenceMatcher`` rather than an
    exact Levenshtein dynamic-programming table: the matcher's similarity
    ratio (``2 * matched / (len(code1) + len(code2))``) is converted into a
    dissimilarity and scaled by the length of the longer snippet. The result
    is an approximation, not a true Levenshtein distance.

    Args:
        code1: First code snippet
        code2: Second code snippet

    Returns:
        0 when the snippets are identical, otherwise a positive integer that
        approximates the number of character edits separating them.
    """
    # Function-scope import so this block does not depend on a module-level
    # ``import difflib`` being present elsewhere in the file.
    import difflib

    if code1 == code2:
        return 0

    # autojunk must be disabled for character-level comparison: with the
    # default heuristic, any character that makes up more than ~1% of a
    # 200+ character second sequence is ignored when seeding matches, which
    # covers most characters in real source code and inflates the distance.
    # NOTE: without the heuristic the matcher can be quadratic in the worst
    # case, though it never allocates a full len(code1) x len(code2) table.
    matcher = difflib.SequenceMatcher(None, code1, code2, autojunk=False)

    # ratio() is a similarity in [0.0, 1.0]; flip it into a dissimilarity.
    dissimilarity = 1.0 - matcher.ratio()

    # Scale by the longer snippet so the value is on the same order of
    # magnitude as a character-level edit count.
    longest = max(len(code1), len(code2))

    # The snippets are known to differ here, so never report 0: round() uses
    # banker's rounding and could otherwise collapse a tiny dissimilarity.
    return max(1, round(dissimilarity * longest))