Skip to content

Commit 5abb748

Browse files
feat: add Smith-Waterman algorithm for local sequence alignment
1 parent b9c118f commit 5abb748

File tree

1 file changed

+291
-0
lines changed

1 file changed

+291
-0
lines changed
Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
/**
2+
* @file
3+
* @brief Implementation of the [Smith-Waterman
4+
* algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm)
5+
* for local sequence alignment
6+
*
7+
* @details
8+
* The Smith-Waterman algorithm is a dynamic programming algorithm for
9+
* determining similar regions between two sequences (nucleotide or protein
10+
* sequences). It performs local sequence alignment, which identifies the best
11+
* matching subsequence rather than aligning entire sequences.
12+
*
13+
* The algorithm works by:
14+
* 1. Creating a scoring matrix where each cell represents the maximum
15+
* alignment score ending at that position
16+
* 2. Using match, mismatch, and gap penalties to calculate scores
17+
* 3. Allowing scores to reset to 0 (ensuring local rather than global
18+
* alignment)
19+
* 4. Tracing back from the highest scoring position to reconstruct the
20+
* alignment
21+
*
22+
* ### Algorithm
23+
* Given two sequences S1 and S2 of lengths m and n:
24+
* - Initialize a (m+1) × (n+1) matrix H with H[i][0] = H[0][j] = 0
25+
* - For each cell H[i][j]:
26+
* - H[i][j] = max(0, H[i-1][j-1] + match/mismatch_score,
27+
* H[i-1][j] + gap_penalty, H[i][j-1] + gap_penalty)
28+
* - Find the maximum value in H matrix
29+
* - Traceback from maximum value to cell with value 0
30+
*
31+
* Time Complexity: O(m × n) where m and n are the lengths of the sequences
32+
* Space Complexity: O(m × n) for the scoring matrix
33+
*
34+
* @author [Ali Alimohammadi](https://github.com/AliAlimohammadi)
35+
*/
36+
37+
#include <algorithm> /// for std::max
38+
#include <cassert> /// for assert
39+
#include <iostream> /// for IO operations
40+
#include <string> /// for std::string
41+
#include <vector> /// for std::vector
42+
43+
/**
44+
* @namespace dynamic_programming
45+
* @brief Dynamic Programming algorithms
46+
*/
47+
namespace dynamic_programming {
48+
/**
49+
* @namespace smith_waterman
50+
* @brief Functions for the Smith-Waterman algorithm
51+
*/
52+
namespace smith_waterman {
53+
54+
/**
55+
* @brief Calculate the score for a character pair
56+
*
57+
* @param a First character
58+
* @param b Second character
59+
* @param match_score Score for matching characters (typically positive)
60+
* @param mismatch_score Score for mismatching characters (typically negative)
61+
* @param gap_score Penalty for gaps (typically negative)
62+
* @return int The calculated score
63+
*/
64+
int score_function(char a, char b, int match_score, int mismatch_score,
65+
int gap_score) {
66+
if (a == '-' || b == '-') {
67+
return gap_score;
68+
} else if (a == b) {
69+
return match_score;
70+
} else {
71+
return mismatch_score;
72+
}
73+
}
74+
75+
/**
76+
* @brief Perform Smith-Waterman local sequence alignment
77+
*
78+
* @param query First sequence
79+
* @param subject Second sequence
80+
* @param match_score Score for matching characters (default: 1)
81+
* @param mismatch_score Score for mismatching characters (default: -1)
82+
* @param gap_score Penalty for gaps (default: -2)
83+
* @return std::vector<std::vector<int>> The scoring matrix
84+
*/
85+
std::vector<std::vector<int>> smith_waterman(const std::string& query,
86+
const std::string& subject,
87+
int match_score = 1,
88+
int mismatch_score = -1,
89+
int gap_score = -2) {
90+
int m = query.length();
91+
int n = subject.length();
92+
93+
// Initialize scoring matrix with zeros
94+
std::vector<std::vector<int>> score(m + 1, std::vector<int>(n + 1, 0));
95+
96+
// Fill the scoring matrix using dynamic programming
97+
for (int i = 1; i <= m; ++i) {
98+
for (int j = 1; j <= n; ++j) {
99+
// Calculate score for match/mismatch
100+
int match_or_mismatch = score[i - 1][j - 1] +
101+
score_function(query[i - 1], subject[j - 1],
102+
match_score, mismatch_score,
103+
gap_score);
104+
105+
// Calculate score for deletion (gap in subject)
106+
int delete_score = score[i - 1][j] + gap_score;
107+
108+
// Calculate score for insertion (gap in query)
109+
int insert_score = score[i][j - 1] + gap_score;
110+
111+
// Take maximum of all options, but never go below 0 (local
112+
// alignment)
113+
score[i][j] =
114+
std::max({0, match_or_mismatch, delete_score, insert_score});
115+
}
116+
}
117+
118+
return score;
119+
}
120+
121+
/**
122+
* @brief Perform traceback to reconstruct the optimal alignment
123+
*
124+
* @param score The score matrix from smith_waterman function
125+
* @param query Original query sequence
126+
* @param subject Original subject sequence
127+
* @param match_score Score for matching characters (default: 1)
128+
* @param mismatch_score Score for mismatching characters (default: -1)
129+
* @param gap_score Penalty for gaps (default: -2)
130+
* @return std::pair<std::string, std::string> The aligned sequences
131+
*/
132+
std::pair<std::string, std::string> traceback(
133+
const std::vector<std::vector<int>>& score, const std::string& query,
134+
const std::string& subject, int match_score = 1, int mismatch_score = -1,
135+
int gap_score = -2) {
136+
// Find the cell with maximum score
137+
int max_value = 0;
138+
int i_max = 0;
139+
int j_max = 0;
140+
141+
for (size_t i = 0; i < score.size(); ++i) {
142+
for (size_t j = 0; j < score[i].size(); ++j) {
143+
if (score[i][j] > max_value) {
144+
max_value = score[i][j];
145+
i_max = i;
146+
j_max = j;
147+
}
148+
}
149+
}
150+
151+
// If no significant alignment found, return empty strings
152+
if (i_max == 0 || j_max == 0) {
153+
return {"", ""};
154+
}
155+
156+
// Traceback from the maximum scoring cell
157+
std::string align1;
158+
std::string align2;
159+
int i = i_max;
160+
int j = j_max;
161+
162+
// Continue tracing back until we hit a cell with score 0
163+
while (i > 0 && j > 0 && score[i][j] > 0) {
164+
int current_score = score[i][j];
165+
166+
// Check if we came from diagonal (match/mismatch)
167+
if (current_score ==
168+
score[i - 1][j - 1] +
169+
score_function(query[i - 1], subject[j - 1], match_score,
170+
mismatch_score, gap_score)) {
171+
align1 = query[i - 1] + align1;
172+
align2 = subject[j - 1] + align2;
173+
--i;
174+
--j;
175+
}
176+
// Check if we came from above (deletion/gap in subject)
177+
else if (current_score == score[i - 1][j] + gap_score) {
178+
align1 = query[i - 1] + align1;
179+
align2 = '-' + align2;
180+
--i;
181+
}
182+
// Otherwise we came from left (insertion/gap in query)
183+
else {
184+
align1 = '-' + align1;
185+
align2 = subject[j - 1] + align2;
186+
--j;
187+
}
188+
}
189+
190+
return {align1, align2};
191+
}
192+
193+
} // namespace smith_waterman
194+
} // namespace dynamic_programming
195+
196+
/**
197+
* @brief Self-test implementations
198+
* @returns void
199+
*/
200+
static void test() {
201+
// Test 1: Simple exact match
202+
auto score1 = dynamic_programming::smith_waterman::smith_waterman("AGT", "AGT");
203+
assert(score1[3][3] == 3); // Perfect match should score 3
204+
std::cout << "Test 1 passed: Simple exact match\n";
205+
206+
// Test 2: Partial match
207+
auto score2 = dynamic_programming::smith_waterman::smith_waterman("ACAC", "CA");
208+
assert(score2[3][2] == 2); // Best local alignment scores 2
209+
std::cout << "Test 2 passed: Partial match\n";
210+
211+
// Test 3: Traceback test
212+
auto result3 = dynamic_programming::smith_waterman::traceback(score2, "ACAC", "CA");
213+
assert(result3.first == "CA");
214+
assert(result3.second == "CA");
215+
std::cout << "Test 3 passed: Traceback\n";
216+
217+
// Test 4: No match
218+
auto score4 = dynamic_programming::smith_waterman::smith_waterman("AAA", "TTT");
219+
int max_score = 0;
220+
for (const auto& row : score4) {
221+
max_score = std::max(max_score, *std::max_element(row.begin(), row.end()));
222+
}
223+
assert(max_score == 0); // No matches should score 0
224+
std::cout << "Test 4 passed: No match\n";
225+
226+
// Test 5: Empty strings
227+
auto score5 = dynamic_programming::smith_waterman::smith_waterman("", "AGT");
228+
assert(score5.size() == 1);
229+
assert(score5[0].size() == 4);
230+
std::cout << "Test 5 passed: Empty query\n";
231+
232+
// Test 6: Longer sequences with gaps
233+
auto score6 = dynamic_programming::smith_waterman::smith_waterman("AGCT", "AGT");
234+
auto result6 = dynamic_programming::smith_waterman::traceback(score6, "AGCT", "AGT");
235+
// The alignment should handle the gap appropriately
236+
assert(!result6.first.empty());
237+
assert(!result6.second.empty());
238+
std::cout << "Test 6 passed: Longer sequences with gaps\n";
239+
240+
// Test 7: Custom scoring
241+
auto score7 = dynamic_programming::smith_waterman::smith_waterman(
242+
"ACGT", "ACGT", 2, -1, -1); // Higher match score
243+
assert(score7[4][4] == 8); // 4 matches × 2 = 8
244+
std::cout << "Test 7 passed: Custom scoring\n";
245+
246+
// Test 8: Case sensitivity (algorithm is case-sensitive)
247+
auto score8 = dynamic_programming::smith_waterman::smith_waterman("ACT", "act");
248+
int max_score8 = 0;
249+
for (const auto& row : score8) {
250+
max_score8 = std::max(max_score8, *std::max_element(row.begin(), row.end()));
251+
}
252+
assert(max_score8 == 0); // Different cases, no match
253+
std::cout << "Test 8 passed: Case sensitivity\n";
254+
255+
std::cout << "All tests passed successfully!\n";
256+
}
257+
258+
/**
259+
* @brief Main function - only enabled for standalone testing
260+
* @returns 0 on exit
261+
*/
262+
int main() {
263+
test(); // Run self-test implementations
264+
265+
// Example usage
266+
std::cout << "\n--- Example Usage ---\n";
267+
std::string query = "ACACACTA";
268+
std::string subject = "AGCACACA";
269+
270+
std::cout << "Query: " << query << "\n";
271+
std::cout << "Subject: " << subject << "\n\n";
272+
273+
// Perform alignment
274+
auto score_matrix =
275+
dynamic_programming::smith_waterman::smith_waterman(query, subject);
276+
auto alignment =
277+
dynamic_programming::smith_waterman::traceback(score_matrix, query, subject);
278+
279+
std::cout << "Aligned sequences:\n";
280+
std::cout << "Query: " << alignment.first << "\n";
281+
std::cout << "Subject: " << alignment.second << "\n";
282+
283+
// Find the maximum score
284+
int max_score = 0;
285+
for (const auto& row : score_matrix) {
286+
max_score = std::max(max_score, *std::max_element(row.begin(), row.end()));
287+
}
288+
std::cout << "Alignment score: " << max_score << "\n";
289+
290+
return 0;
291+
}

0 commit comments

Comments
 (0)