|
| 1 | +/** |
| 2 | + * @file |
| 3 | + * @brief Implementation of the [Smith-Waterman |
| 4 | + * algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) |
| 5 | + * for local sequence alignment |
| 6 | + * |
| 7 | + * @details |
| 8 | + * The Smith-Waterman algorithm is a dynamic programming algorithm for |
| 9 | + * determining similar regions between two sequences (nucleotide or protein |
| 10 | + * sequences). It performs local sequence alignment, which identifies the best |
| 11 | + * matching subsequence rather than aligning entire sequences. |
| 12 | + * |
| 13 | + * The algorithm works by: |
| 14 | + * 1. Creating a scoring matrix where each cell represents the maximum |
| 15 | + * alignment score ending at that position |
| 16 | + * 2. Using match, mismatch, and gap penalties to calculate scores |
| 17 | + * 3. Allowing scores to reset to 0 (ensuring local rather than global |
| 18 | + * alignment) |
| 19 | + * 4. Tracing back from the highest scoring position to reconstruct the |
| 20 | + * alignment |
| 21 | + * |
| 22 | + * ### Algorithm |
| 23 | + * Given two sequences S1 and S2 of lengths m and n: |
| 24 | + * - Initialize a (m+1) × (n+1) matrix H with H[i][0] = H[0][j] = 0 |
| 25 | + * - For each cell H[i][j]: |
| 26 | + * - H[i][j] = max(0, H[i-1][j-1] + match/mismatch_score, |
| 27 | + * H[i-1][j] + gap_penalty, H[i][j-1] + gap_penalty) |
| 28 | + * - Find the maximum value in H matrix |
| 29 | + * - Traceback from maximum value to cell with value 0 |
| 30 | + * |
| 31 | + * Time Complexity: O(m × n) where m and n are the lengths of the sequences |
| 32 | + * Space Complexity: O(m × n) for the scoring matrix |
| 33 | + * |
| 34 | + * @author [Ali Alimohammadi](https://github.com/AliAlimohammadi) |
| 35 | + */ |
| 36 | + |
| 37 | +#include <algorithm> /// for std::max |
| 38 | +#include <cassert> /// for assert |
| 39 | +#include <iostream> /// for IO operations |
| 40 | +#include <string> /// for std::string |
| 41 | +#include <vector> /// for std::vector |
| 42 | + |
| 43 | +/** |
| 44 | + * @namespace dynamic_programming |
| 45 | + * @brief Dynamic Programming algorithms |
| 46 | + */ |
| 47 | +namespace dynamic_programming { |
| 48 | +/** |
| 49 | + * @namespace smith_waterman |
| 50 | + * @brief Functions for the Smith-Waterman algorithm |
| 51 | + */ |
| 52 | +namespace smith_waterman { |
| 53 | + |
| 54 | +/** |
| 55 | + * @brief Calculate the score for a character pair |
| 56 | + * |
| 57 | + * @param a First character |
| 58 | + * @param b Second character |
| 59 | + * @param match_score Score for matching characters (typically positive) |
| 60 | + * @param mismatch_score Score for mismatching characters (typically negative) |
| 61 | + * @param gap_score Penalty for gaps (typically negative) |
| 62 | + * @return int The calculated score |
| 63 | + */ |
| 64 | +int score_function(char a, char b, int match_score, int mismatch_score, |
| 65 | + int gap_score) { |
| 66 | + if (a == '-' || b == '-') { |
| 67 | + return gap_score; |
| 68 | + } else if (a == b) { |
| 69 | + return match_score; |
| 70 | + } else { |
| 71 | + return mismatch_score; |
| 72 | + } |
| 73 | +} |
| 74 | + |
| 75 | +/** |
| 76 | + * @brief Perform Smith-Waterman local sequence alignment |
| 77 | + * |
| 78 | + * @param query First sequence |
| 79 | + * @param subject Second sequence |
| 80 | + * @param match_score Score for matching characters (default: 1) |
| 81 | + * @param mismatch_score Score for mismatching characters (default: -1) |
| 82 | + * @param gap_score Penalty for gaps (default: -2) |
| 83 | + * @return std::vector<std::vector<int>> The scoring matrix |
| 84 | + */ |
| 85 | +std::vector<std::vector<int>> smith_waterman(const std::string& query, |
| 86 | + const std::string& subject, |
| 87 | + int match_score = 1, |
| 88 | + int mismatch_score = -1, |
| 89 | + int gap_score = -2) { |
| 90 | + int m = query.length(); |
| 91 | + int n = subject.length(); |
| 92 | + |
| 93 | + // Initialize scoring matrix with zeros |
| 94 | + std::vector<std::vector<int>> score(m + 1, std::vector<int>(n + 1, 0)); |
| 95 | + |
| 96 | + // Fill the scoring matrix using dynamic programming |
| 97 | + for (int i = 1; i <= m; ++i) { |
| 98 | + for (int j = 1; j <= n; ++j) { |
| 99 | + // Calculate score for match/mismatch |
| 100 | + int match_or_mismatch = score[i - 1][j - 1] + |
| 101 | + score_function(query[i - 1], subject[j - 1], |
| 102 | + match_score, mismatch_score, |
| 103 | + gap_score); |
| 104 | + |
| 105 | + // Calculate score for deletion (gap in subject) |
| 106 | + int delete_score = score[i - 1][j] + gap_score; |
| 107 | + |
| 108 | + // Calculate score for insertion (gap in query) |
| 109 | + int insert_score = score[i][j - 1] + gap_score; |
| 110 | + |
| 111 | + // Take maximum of all options, but never go below 0 (local |
| 112 | + // alignment) |
| 113 | + score[i][j] = |
| 114 | + std::max({0, match_or_mismatch, delete_score, insert_score}); |
| 115 | + } |
| 116 | + } |
| 117 | + |
| 118 | + return score; |
| 119 | +} |
| 120 | + |
| 121 | +/** |
| 122 | + * @brief Perform traceback to reconstruct the optimal alignment |
| 123 | + * |
| 124 | + * @param score The score matrix from smith_waterman function |
| 125 | + * @param query Original query sequence |
| 126 | + * @param subject Original subject sequence |
| 127 | + * @param match_score Score for matching characters (default: 1) |
| 128 | + * @param mismatch_score Score for mismatching characters (default: -1) |
| 129 | + * @param gap_score Penalty for gaps (default: -2) |
| 130 | + * @return std::pair<std::string, std::string> The aligned sequences |
| 131 | + */ |
| 132 | +std::pair<std::string, std::string> traceback( |
| 133 | + const std::vector<std::vector<int>>& score, const std::string& query, |
| 134 | + const std::string& subject, int match_score = 1, int mismatch_score = -1, |
| 135 | + int gap_score = -2) { |
| 136 | + // Find the cell with maximum score |
| 137 | + int max_value = 0; |
| 138 | + int i_max = 0; |
| 139 | + int j_max = 0; |
| 140 | + |
| 141 | + for (size_t i = 0; i < score.size(); ++i) { |
| 142 | + for (size_t j = 0; j < score[i].size(); ++j) { |
| 143 | + if (score[i][j] > max_value) { |
| 144 | + max_value = score[i][j]; |
| 145 | + i_max = i; |
| 146 | + j_max = j; |
| 147 | + } |
| 148 | + } |
| 149 | + } |
| 150 | + |
| 151 | + // If no significant alignment found, return empty strings |
| 152 | + if (i_max == 0 || j_max == 0) { |
| 153 | + return {"", ""}; |
| 154 | + } |
| 155 | + |
| 156 | + // Traceback from the maximum scoring cell |
| 157 | + std::string align1; |
| 158 | + std::string align2; |
| 159 | + int i = i_max; |
| 160 | + int j = j_max; |
| 161 | + |
| 162 | + // Continue tracing back until we hit a cell with score 0 |
| 163 | + while (i > 0 && j > 0 && score[i][j] > 0) { |
| 164 | + int current_score = score[i][j]; |
| 165 | + |
| 166 | + // Check if we came from diagonal (match/mismatch) |
| 167 | + if (current_score == |
| 168 | + score[i - 1][j - 1] + |
| 169 | + score_function(query[i - 1], subject[j - 1], match_score, |
| 170 | + mismatch_score, gap_score)) { |
| 171 | + align1 = query[i - 1] + align1; |
| 172 | + align2 = subject[j - 1] + align2; |
| 173 | + --i; |
| 174 | + --j; |
| 175 | + } |
| 176 | + // Check if we came from above (deletion/gap in subject) |
| 177 | + else if (current_score == score[i - 1][j] + gap_score) { |
| 178 | + align1 = query[i - 1] + align1; |
| 179 | + align2 = '-' + align2; |
| 180 | + --i; |
| 181 | + } |
| 182 | + // Otherwise we came from left (insertion/gap in query) |
| 183 | + else { |
| 184 | + align1 = '-' + align1; |
| 185 | + align2 = subject[j - 1] + align2; |
| 186 | + --j; |
| 187 | + } |
| 188 | + } |
| 189 | + |
| 190 | + return {align1, align2}; |
| 191 | +} |
| 192 | + |
| 193 | +} // namespace smith_waterman |
| 194 | +} // namespace dynamic_programming |
| 195 | + |
| 196 | +/** |
| 197 | + * @brief Self-test implementations |
| 198 | + * @returns void |
| 199 | + */ |
| 200 | +static void test() { |
| 201 | + // Test 1: Simple exact match |
| 202 | + auto score1 = dynamic_programming::smith_waterman::smith_waterman("AGT", "AGT"); |
| 203 | + assert(score1[3][3] == 3); // Perfect match should score 3 |
| 204 | + std::cout << "Test 1 passed: Simple exact match\n"; |
| 205 | + |
| 206 | + // Test 2: Partial match |
| 207 | + auto score2 = dynamic_programming::smith_waterman::smith_waterman("ACAC", "CA"); |
| 208 | + assert(score2[3][2] == 2); // Best local alignment scores 2 |
| 209 | + std::cout << "Test 2 passed: Partial match\n"; |
| 210 | + |
| 211 | + // Test 3: Traceback test |
| 212 | + auto result3 = dynamic_programming::smith_waterman::traceback(score2, "ACAC", "CA"); |
| 213 | + assert(result3.first == "CA"); |
| 214 | + assert(result3.second == "CA"); |
| 215 | + std::cout << "Test 3 passed: Traceback\n"; |
| 216 | + |
| 217 | + // Test 4: No match |
| 218 | + auto score4 = dynamic_programming::smith_waterman::smith_waterman("AAA", "TTT"); |
| 219 | + int max_score = 0; |
| 220 | + for (const auto& row : score4) { |
| 221 | + max_score = std::max(max_score, *std::max_element(row.begin(), row.end())); |
| 222 | + } |
| 223 | + assert(max_score == 0); // No matches should score 0 |
| 224 | + std::cout << "Test 4 passed: No match\n"; |
| 225 | + |
| 226 | + // Test 5: Empty strings |
| 227 | + auto score5 = dynamic_programming::smith_waterman::smith_waterman("", "AGT"); |
| 228 | + assert(score5.size() == 1); |
| 229 | + assert(score5[0].size() == 4); |
| 230 | + std::cout << "Test 5 passed: Empty query\n"; |
| 231 | + |
| 232 | + // Test 6: Longer sequences with gaps |
| 233 | + auto score6 = dynamic_programming::smith_waterman::smith_waterman("AGCT", "AGT"); |
| 234 | + auto result6 = dynamic_programming::smith_waterman::traceback(score6, "AGCT", "AGT"); |
| 235 | + // The alignment should handle the gap appropriately |
| 236 | + assert(!result6.first.empty()); |
| 237 | + assert(!result6.second.empty()); |
| 238 | + std::cout << "Test 6 passed: Longer sequences with gaps\n"; |
| 239 | + |
| 240 | + // Test 7: Custom scoring |
| 241 | + auto score7 = dynamic_programming::smith_waterman::smith_waterman( |
| 242 | + "ACGT", "ACGT", 2, -1, -1); // Higher match score |
| 243 | + assert(score7[4][4] == 8); // 4 matches × 2 = 8 |
| 244 | + std::cout << "Test 7 passed: Custom scoring\n"; |
| 245 | + |
| 246 | + // Test 8: Case sensitivity (algorithm is case-sensitive) |
| 247 | + auto score8 = dynamic_programming::smith_waterman::smith_waterman("ACT", "act"); |
| 248 | + int max_score8 = 0; |
| 249 | + for (const auto& row : score8) { |
| 250 | + max_score8 = std::max(max_score8, *std::max_element(row.begin(), row.end())); |
| 251 | + } |
| 252 | + assert(max_score8 == 0); // Different cases, no match |
| 253 | + std::cout << "Test 8 passed: Case sensitivity\n"; |
| 254 | + |
| 255 | + std::cout << "All tests passed successfully!\n"; |
| 256 | +} |
| 257 | + |
| 258 | +/** |
| 259 | + * @brief Main function - only enabled for standalone testing |
| 260 | + * @returns 0 on exit |
| 261 | + */ |
| 262 | +int main() { |
| 263 | + test(); // Run self-test implementations |
| 264 | + |
| 265 | + // Example usage |
| 266 | + std::cout << "\n--- Example Usage ---\n"; |
| 267 | + std::string query = "ACACACTA"; |
| 268 | + std::string subject = "AGCACACA"; |
| 269 | + |
| 270 | + std::cout << "Query: " << query << "\n"; |
| 271 | + std::cout << "Subject: " << subject << "\n\n"; |
| 272 | + |
| 273 | + // Perform alignment |
| 274 | + auto score_matrix = |
| 275 | + dynamic_programming::smith_waterman::smith_waterman(query, subject); |
| 276 | + auto alignment = |
| 277 | + dynamic_programming::smith_waterman::traceback(score_matrix, query, subject); |
| 278 | + |
| 279 | + std::cout << "Aligned sequences:\n"; |
| 280 | + std::cout << "Query: " << alignment.first << "\n"; |
| 281 | + std::cout << "Subject: " << alignment.second << "\n"; |
| 282 | + |
| 283 | + // Find the maximum score |
| 284 | + int max_score = 0; |
| 285 | + for (const auto& row : score_matrix) { |
| 286 | + max_score = std::max(max_score, *std::max_element(row.begin(), row.end())); |
| 287 | + } |
| 288 | + std::cout << "Alignment score: " << max_score << "\n"; |
| 289 | + |
| 290 | + return 0; |
| 291 | +} |
0 commit comments