diff --git a/DIRECTORY.md b/DIRECTORY.md
index 0f9859577493..f759b7f19da3 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -461,6 +461,7 @@
 
 ## Genetic Algorithm
   * [Basic String](genetic_algorithm/basic_string.py)
+  * [Knapsack](genetic_algorithm/knapsack.py)
 
 ## Geodesy
   * [Haversine Distance](geodesy/haversine_distance.py)
diff --git a/genetic_algorithm/knapsack.py b/genetic_algorithm/knapsack.py
new file mode 100644
index 000000000000..270dc0c9acfd
--- /dev/null
+++ b/genetic_algorithm/knapsack.py
@@ -0,0 +1,345 @@
+"""Did you know that Genetic Algorithms can be used to quickly approximate
+combinatorial optimization problems such as knapsack?
+
+It is commonly known that combinatorial optimization problems can be solved using
+dynamic programming. It is lesser known that genetic algorithms (or evolutionary
+computing in general) can reach the best solution fairly quickly in a lot of cases.
+Otherwise, it can still approximate a very good solution (in life, good is good
+enough).
+
+Genetic algorithms: https://en.wikipedia.org/wiki/Genetic_algorithm
+Evolutionary computation: https://en.wikipedia.org/wiki/Evolutionary_computation
+Knapsack problem: https://en.wikipedia.org/wiki/Knapsack_problem
+
+Run doctests:
+    python -m doctest -v knapsack.py
+"""
+
+import random
+from dataclasses import dataclass
+
+random.seed(42)
+
+# =========================== Problem setup: Knapsack ===========================
+
+KNAPSACK_N_ITEMS: int = 42  # Number of items in the knapsack problem
+KNAPSACK_VALUE_RANGE: tuple[int, int] = (10, 100)  # Range of item values
+KNAPSACK_WEIGHT_RANGE: tuple[int, int] = (5, 50)  # Range of item weights
+KNAPSACK_CAPACITY_RATIO: float = 0.5  # Capacity as a fraction of total weight
+
+
+@dataclass
+class Item:
+    value: int
+    weight: int
+
+
+def generate_knapsack_instance(
+    n_items: int,
+    value_range: tuple[int, int],
+    weight_range: tuple[int, int],
+    capacity_ratio: float,
+) -> tuple[list[Item], int]:
+    """
+    Generates a random knapsack problem instance.
+
+    Returns a tuple: (items, capacity), where items is a list of Item(value, weight)
+    and capacity is an int computed as floor(capacity_ratio * total_weight).
+
+    Examples
+    --------
+    Use a tiny, deterministic instance to validate shape and capacity range:
+
+    >>> random.seed(0)
+    >>> items, cap = generate_knapsack_instance(
+    ...     n_items=3,
+    ...     value_range=(5, 5),
+    ...     weight_range=(10, 10),
+    ...     capacity_ratio=0.5
+    ... )
+    >>> len(items), cap
+    (3, 15)
+    >>> all(isinstance(it, Item) for it in items)
+    True
+    >>> [it.value for it in items], [it.weight for it in items]
+    ([5, 5, 5], [10, 10, 10])
+    """
+    items = []
+    for _ in range(n_items):
+        value = random.randint(*value_range)
+        weight = random.randint(*weight_range)
+        items.append(Item(value=value, weight=weight))
+    # Capacity as a fraction of total weight
+    capacity = int(sum(it.weight for it in items) * capacity_ratio)
+    return items, capacity
+
+
+# Example problem instance (printing is guarded by __main__ below)
+items, capacity = generate_knapsack_instance(
+    n_items=KNAPSACK_N_ITEMS,
+    value_range=KNAPSACK_VALUE_RANGE,
+    weight_range=KNAPSACK_WEIGHT_RANGE,
+    capacity_ratio=KNAPSACK_CAPACITY_RATIO,
+)
+
+# ============================== GA Representation ==============================
+
+# HYPERPARAMETERS (for tuning the GA)
+
+POPULATION_SIZE = 120
+GENERATIONS = 200
+CROSSOVER_PROBABILITY = 0.9
+MUTATION_PROBABILITY = 0.01
+TOURNAMENT_K = 3
+ELITISM = 2
+
+OVERWEIGHT_PENALTY_FACTOR = 10
+
+genome_t = list[int]  # Bit list: 1 means the item is included, 0 means it is excluded
+
+
+def evaluate(genome: genome_t, items: list[Item], capacity: int) -> tuple[int, int]:
+    """
+    Calculates fitness (value) and weight of a candidate solution. If overweight,
+    the returned value is penalized; weight is the actual summed weight.
+
+    Returns (value, weight).
+
+    Examples
+    --------
+    Feasible genome (no penalty):
+
+    >>> it = [Item(10, 4), Item(7, 3), Item(5, 2)]
+    >>> genome = [1, 0, 1]  # take items 0 and 2
+    >>> evaluate(genome, it, capacity=7)
+    (15, 6)
+
+    Overweight genome (penalty applies):
+    Total value = 10+7+5 = 22, total weight = 9, capacity = 7, overflow = 2
+    Penalized value = max(0, 22 - 2 * OVERWEIGHT_PENALTY_FACTOR) = 2
+
+    >>> genome = [1, 1, 1]
+    >>> evaluate(genome, it, capacity=7)
+    (2, 9)
+    """
+    total_value = 0
+    total_weight = 0
+    for gene, item in zip(genome, items):
+        if gene:
+            total_value += item.value
+            total_weight += item.weight
+    if total_weight > capacity:
+        overflow = total_weight - capacity
+        total_value = max(0, total_value - overflow * OVERWEIGHT_PENALTY_FACTOR)
+    return total_value, total_weight
+
+
+def random_genome(length: int) -> genome_t:
+    """
+    Generates a random genome (list of 0/1) of the given length.
+
+    Examples
+    --------
+    Check length and content are 0/1 bits:
+
+    >>> random.seed(123)
+    >>> g = random_genome(5)
+    >>> len(g), set(g).issubset({0, 1})
+    (5, True)
+    """
+    return [random.randint(0, 1) for _ in range(length)]
+
+
+def selection(
+    population: list[genome_t], fitnesses: list[int], tournament_k: int
+) -> genome_t:
+    """
+    Performs tournament selection to choose a genome from the population.
+
+    Note that other selection strategies exist such as roulette wheel, rank-based, etc.
+
+    Examples
+    --------
+    Deterministic tournament with fixed seed (k=2):
+
+    >>> random.seed(1)
+    >>> pop = [[0,0,0], [1,0,0], [1,1,0], [1,1,1]]
+    >>> fits = [0, 5, 9, 7]
+    >>> parent = selection(pop, fits, tournament_k=2)
+    >>> parent in pop
+    True
+    """
+    contenders = random.sample(list(zip(population, fitnesses)), tournament_k)
+    return max(contenders, key=lambda contender: contender[1])[0][:]
+
+
+def crossover(
+    genome_1: genome_t, genome_2: genome_t, p_crossover: float
+) -> tuple[genome_t, genome_t]:
+    """
+    Performs single-point crossover between two genomes.
+    If crossover does not occur (random > p_crossover) or genomes are too short,
+    returns copies of the parents.
+
+    Note: other crossover strategies exist (two-point, uniform, etc.).
+
+    Examples
+    --------
+    Force crossover with p=1.0 and fixed RNG; verify lengths and bit content:
+
+    >>> random.seed(2)
+    >>> a, b = [0,0,0,0], [1,1,1,1]
+    >>> c1, c2 = crossover(a, b, p_crossover=1.0)
+    >>> len(c1) == len(a) == len(c2) == len(b)
+    True
+    >>> set(c1).issubset({0,1}) and set(c2).issubset({0,1})
+    True
+
+    No crossover if p=0.0:
+
+    >>> c1, c2 = crossover([0,0,0], [1,1,1], p_crossover=0.0)
+    >>> c1, c2
+    ([0, 0, 0], [1, 1, 1])
+    """
+    min_length = min(len(genome_1), len(genome_2))
+    if random.random() > p_crossover or min_length < 2:
+        return genome_1[:], genome_2[:]
+    cutoff_point = random.randint(1, min_length - 1)
+    child_1 = genome_1[:cutoff_point] + genome_2[cutoff_point:]
+    child_2 = genome_2[:cutoff_point] + genome_1[cutoff_point:]
+    return child_1, child_2
+
+
+def mutation(genome: genome_t, p_mutation: float) -> genome_t:
+    """
+    Performs bit-flip mutation on a genome. Each bit flips with probability p_mutation.
+
+    Note: other mutation strategies exist (swap, scramble, etc.).
+
+    Examples
+    --------
+    With probability 1.0, every bit flips:
+
+    >>> mutation([0, 1, 1, 0], p_mutation=1.0)
+    [1, 0, 0, 1]
+
+    With probability 0.0, nothing changes:
+
+    >>> mutation([0, 1, 1, 0], p_mutation=0.0)
+    [0, 1, 1, 0]
+    """
+    return [(1 - gene) if random.random() < p_mutation else gene for gene in genome]
+
+
+def run_ga(
+    items: list[Item],
+    capacity: int,
+    pop_size: int = POPULATION_SIZE,
+    generations: int = GENERATIONS,
+    p_crossover: float = CROSSOVER_PROBABILITY,
+    p_mutation: float = MUTATION_PROBABILITY,
+    tournament_k: int = TOURNAMENT_K,
+    elitism: int = ELITISM,
+) -> dict:
+    """
+    Runs the genetic algorithm to (approximately) solve the knapsack problem.
+
+    Returns a dict with keys:
+    - 'best_genome' (genome_t)
+    - 'best_value' (int)
+    - 'best_weight' (int)
+    - 'capacity' (int)
+    - 'best_fitness_history' (list[int])
+    - 'avg_fitness_history' (list[float])
+
+    Examples
+    --------
+    Use a tiny instance and few generations to validate structure and lengths:
+
+    >>> random.seed(1234)
+    >>> tiny_items = [Item(5,2), Item(6,3), Item(2,1), Item(7,4)]
+    >>> cap = 5
+    >>> out = run_ga(
+    ...     tiny_items, cap,
+    ...     pop_size=10, generations=5,
+    ...     p_crossover=0.9, p_mutation=0.05,
+    ...     tournament_k=2, elitism=1
+    ... )
+    >>> len(out['best_fitness_history']) == 5 and len(out['avg_fitness_history']) == 5
+    True
+    >>> isinstance(out['best_genome'], list) and isinstance(out['best_value'], int)
+    True
+    >>> out['capacity'] == cap
+    True
+    """
+    population = [random_genome(len(items)) for _ in range(pop_size)]
+    best_fitness_history: list[int] = []  # track best fitness per generation
+    avg_fitness_history: list[float] = []
+    best_genome_overall: genome_t = []
+    best_fitness_overall: int = -1
+
+    for _ in range(generations):
+        fitnesses = [evaluate(genome, items, capacity)[0] for genome in population]
+        best_fit = max(fitnesses)
+        best_idx = fitnesses.index(best_fit)
+        best_fitness_history.append(best_fit)
+        avg_fit = sum(fitnesses) / pop_size
+        avg_fitness_history.append(avg_fit)
+
+        if best_fit > best_fitness_overall:
+            best_fitness_overall = best_fit
+            best_genome_overall = population[best_idx][:]
+
+        # Elitism
+        sorted_indices = sorted(range(pop_size), key=lambda idx: fitnesses[idx])
+        elite_indices = sorted_indices[::-1][:elitism]  # reverse and take top indices
+        elites = [population[idx][:] for idx in elite_indices]
+
+        # New generation
+        new_pop = elites[:]
+        while len(new_pop) < pop_size:
+            parent1 = selection(population, fitnesses, tournament_k=tournament_k)
+            parent2 = selection(population, fitnesses, tournament_k=tournament_k)
+            child1, child2 = crossover(parent1, parent2, p_crossover)
+            child1 = mutation(child1, p_mutation)
+            child2 = mutation(child2, p_mutation)
+            new_pop.extend([child1, child2])
+        population = new_pop[:pop_size]
+
+    # Final evaluation of the best
+    best_value, best_weight = evaluate(best_genome_overall, items, capacity)
+    return {
+        "best_genome": best_genome_overall,
+        "best_value": best_value,
+        "best_weight": best_weight,
+        "capacity": capacity,
+        "best_fitness_history": best_fitness_history,
+        "avg_fitness_history": avg_fitness_history,
+    }
+
+
+# ================================ Script entry =================================
+
+if __name__ == "__main__":
+    result = run_ga(items, capacity)
+    best_value, best_weight = result["best_value"], result["best_weight"]
+
+    print(f"Knapsack capacity: {result['capacity']}")
+    print(f"Best solution: value = {best_value}, weight = {best_weight}")
+    # # Uncomment to inspect chosen items:
+    # best_items = [
+    #     items[idx] for idx, bit in enumerate(result["best_genome"]) if bit == 1
+    # ]
+    # print("Items included in the best solution:", best_items)
+
+    # # Optional: plot fitness curves
+    # import matplotlib.pyplot as plt
+    # plt.figure()
+    # plt.plot(result["best_fitness_history"], label="Best fitness")
+    # plt.plot(result["avg_fitness_history"], label="Average fitness")
+    # plt.title("GA on Knapsack: Fitness over Generations")
+    # plt.xlabel("Generation")
+    # plt.ylabel("Fitness")
+    # plt.legend()
+    # plt.tight_layout()
+    # plt.show()
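The module docstring notes that the knapsack problem can also be solved exactly with dynamic programming; for small instances, a sketch like the one below (not part of the diff) can be used to compare the GA's answer with the exact optimum. This is illustrative only: the helper name knapsack_dp, the seed, and the GA hyperparameters are assumptions, and the import assumes the new file is importable as genetic_algorithm.knapsack when run from the repository root.

import random

from genetic_algorithm.knapsack import Item, generate_knapsack_instance, run_ga


def knapsack_dp(items: list[Item], capacity: int) -> int:
    # Exact 0/1 knapsack optimum via the classic O(n * capacity) DP table.
    best = [0] * (capacity + 1)
    for item in items:
        # Iterate weights downwards so each item is used at most once.
        for weight in range(capacity, item.weight - 1, -1):
            best[weight] = max(best[weight], best[weight - item.weight] + item.value)
    return best[capacity]


if __name__ == "__main__":
    random.seed(7)  # Arbitrary seed for a reproducible comparison
    small_items, cap = generate_knapsack_instance(
        n_items=12, value_range=(10, 100), weight_range=(5, 50), capacity_ratio=0.5
    )
    ga_out = run_ga(small_items, cap, pop_size=60, generations=100)
    print("GA best value:", ga_out["best_value"])
    print("DP optimum:   ", knapsack_dp(small_items, cap))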