/* Dynamic-Programming + Divide & conquer for getting cost and aligned strings for problem of aligning two large strings Reference - Wikipedia-"Hirschberg's algorithm" */ #include using namespace std; //cost associated int cost = 0; //insertion cost inline int insC() { return 4; } //deletion cost inline int delC() { return 4; } //modification cost inline int modC(char fr, char to) { if(fr == to) return 0; return 3; } //reversing the string string rever(string s) { int k = s.length(); for(int i = 0; i < k/2; i++) swap(s[i], s[k-i-1]); return s; } //minimizing the sum of shortest paths (see:GainAlignment function), calculating next starting point int minimize(vector ve1, vector ve2, int le) { int sum, xmid = 0; for(int i = 0; i <= le; i++) { if(i == 0) sum = ve1[i] + ve2[le-i]; //reversing the array ve2 by taking its i element from last if(sum > ve1[i] + ve2[le-i]) { sum = ve1[i] + ve2[le-i]; xmid = i; } } return xmid; } pair stringOne(string s1, string s2) { //building of array for case string length one of any of the string string sa, sb; int m = s1.length(); int n = s2.length(); vector > fone(s1.length()+5, vector(2)); for(int i = 0; i <= m; i++) fone[i][0] = i*delC(); for(int j = 0; j <= n; j++) fone[0][j] = j*insC(); for(int i = 1; i <= m; i++) { int j; //recurrence for(j = 1; j <= n; j++) { fone[i][j] = min(modC(s1[i-1], s2[j-1]) + fone[i-1][j-1], min(delC() + fone[i-1][j], insC() + fone[i][j-1])); } } int i = m, j = n; cost += fone[i][j]; /* This code can be shortened as beforehand we know one of the string has length one but this gives general idea of a how a backtracking can be done is case of general strings length given we can use m*n space */ //backtracking in case the length is one of string while(true) { if(i == 0 || j == 0) break; //fone[i][j] will have one of the three above values from which it is derived so comapring from each one if(i >= 0 && j >= 0 && fone[i][j] == modC(s1[i-1], s2[j-1]) + fone[i-1][j-1]) { sa.push_back(s1[i-1]); sb.push_back(s2[j-1]); i--; j--; } else if((i-1) >= 0 && j >= 0 && fone[i][j] == delC() + fone[i-1][j]) { sa.push_back(s1[i-1]); sb.push_back('-'); i-=1; } else if(i >= 0 && (j-1) >= 0 && fone[i][j-1] == insC() + fone[i][j-1]){ sa.push_back('-'); sb.push_back(s2[j-1]); j-=1; } } //continue backtracking if(i != 0) { while(i) { sa.push_back(s1[i-1]); sb.push_back('-'); i--; } } else { while(j) { sa.push_back('-'); sb.push_back(s2[j-1]); j--; } } //strings obtained are reversed alignment because we have started from i = m, j = n reverse(sa.begin(), sa.end()); reverse(sb.begin(), sb.end()); return make_pair(sa, sb); } //getting the cost associated with alignment vector SpaceEfficientAlignment(string s1, string s2) { //space efficient version int m = s1.length(); int n = s2.length(); vector > array2d(m+5, vector(2)); //base conditions for(int i = 0; i <= m; i++) array2d[i][0] = i*(delC()); for(int j = 1; j <= n; j++) { array2d[0][1] = j*(insC()); //recurrence for(int i = 1; i <= m; i++) { array2d[i][1] = min(modC(s1[i-1], s2[j-1]) + array2d[i-1][0], min(delC() + array2d[i-1][1], insC() + array2d[i][0])); } for(int i = 0; i <= m; i++) array2d[i][0] = array2d[i][1]; } //returning the last column to get the row element x in n/2 column: see GainAlignment function vector vec(m+1); for(int i = 0; i <= m; i++) { vec[i] = array2d[i][1]; } return vec; } pair GainAlignment(string s1, string s2) { string te1, te2; //for storing alignments int l1 = s1.length(); int l2 = s2.length(); //trivial cases of length = 0 or length = 1 if(l1 == 0) { for(int i = 0; i < l2; i++) { te1.push_back('-'); te2.push_back(s2[i]); cost += insC(); } } else if(l2 == 0) { for(int i = 0; i < l1; i++) { te1.push_back(s1[i]); te2.push_back('-'); cost += delC(); } } else if(l1 == 1 || l2 == 1) { pair temp = stringOne(s1, s2); te1 = temp.first; te2 = temp.second; } //main divide and conquer else { int ymid = l2/2; /* We know edit distance problem can be seen as shortest path from initial(0,0) to (l1,l2) Now, here we are seeing it in two parts from (0,0) to (i,j) and from (i+1,j+1) to (m,n) and we will see for which i it is getting minimize. */ vector ScoreL = SpaceEfficientAlignment(s1, s2.substr(0, ymid)); //for distance (0,0) to (i,j) vector ScoreR = SpaceEfficientAlignment(rever(s1), rever(s2.substr(ymid, l2-ymid))); //for distance (i+1,j+1) to (m,n) int xmid = minimize(ScoreL, ScoreR, l1); //minimizing the distance pair temp = GainAlignment(s1.substr(0, xmid), s2.substr(0, ymid)); te1 = temp.first; te2 = temp.second; //storing the alignment temp = GainAlignment(s1.substr(xmid, l1-xmid), s2.substr(ymid, l2-ymid)); te1 += temp.first; te2 += temp.second; //storing the alignment } return make_pair(te1, te2); } int main() { string s1, s2; s1 = "GCCCTAGCG"; s2 = "GCGCAATG"; //cin >> s1 >> s2; /*standard input stream*/ /* If reading from strings from two files */ // ifstream file1("num.txt"); // ifstream file2("nu.txt"); // getline(file1, s1); // file1.close(); // getline(file2, s2); // file2.close(); pair temp = GainAlignment(s1, s2); //Alignments can be different for same strings but cost will be same cout << "Alignments of strings\n"; cout << temp.first << "\n"; cout << temp.second << "\n"; cout << "Cost associated = " << cost << "\n"; /* Alignments M - modification, D - deletion, I - insertion for converting string1 to string2 M M MD string1 - GCCCTAGCG string2 - GCGCAAT-G */ }