/** * @file * @brief [Longest Common Subsequence](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) algorithm * @details * From Wikipedia: The longest common subsequence (LCS) problem is the problem * of finding the longest subsequence common to all sequences in a set of sequences * (often just two sequences). * @author [Kurtz](https://github.com/itskurtz) */ #include /* for io operations */ #include /* for memory management & exit */ #include /* for string manipulation & ooperations */ #include /* for asserts */ enum {LEFT, UP, DIAG}; /** * @breif Computes LCS between s1 and s2 using a dynamic-programming approach * @param1 s1 first null-terminated string * @param2 s2 second null-terminated string * @param3 l1 length of s1 * @param4 l2 length of s2 * @param5 L matrix of size l1 x l2 * @param6 B matrix of size l1 x l2 * @returns void */ void lcslen(const char *s1, const char *s2, int l1, int l2, int **L, int **B) { /* B is the directions matrix L is the LCS matrix */ int i, j; /* loop over the simbols in my sequences save the directions according to the LCS */ for (i = 1; i <= l1; ++i) for (j = 1; j <= l2; ++j) if (s1[i-1] == s2[j-1]) { L[i][j] = 1 + L[i-1][j-1]; B[i][j] = DIAG; } else if (L[i-1][j] < L[i][j-1]) { L[i][j] = L[i][j-1]; B[i][j] = LEFT; } else { L[i][j] = L[i-1][j]; B[i][j] = UP; } } /** * @breif Builds the LCS according to B using a traceback approach * @param1 s1 first null-terminated string * @param2 l1 length of s1 * @param3 l2 length of s2 * @param4 L matrix of size l1 x l2 * @param5 B matrix of size l1 x l2 * @returns lcs longest common subsequence */ char *lcsbuild(const char *s1, int l1, int l2, int **L, int **B) { int i, j, lcsl; char *lcs; lcsl = L[l1][l2]; /* my lcs is at least the empty symbol */ lcs = (char *)calloc(lcsl+1, sizeof(char)); /* null-terminated \0 */ if (!lcs) { perror("calloc: "); return NULL; } i = l1, j = l2; while (i > 0 && j > 0) { /* walk the matrix backwards */ if (B[i][j] == DIAG) { lcs[--lcsl] = s1[i-1]; i = i - 1; j = j - 1; } else if (B[i][j] == LEFT) j = j - 1; else i = i - 1; } return lcs; } /** * @brief Self-test implementations * @returns void */ static void test() { /* https://en.wikipedia.org/wiki/Subsequence#Applications */ int **L, **B, j, l1, l2; char *s1 = "ACGGTGTCGTGCTATGCTGATGCTGACTTATATGCTA"; char *s2 = "CGTTCGGCTATCGTACGTTCTATTCTATGATTTCTAA"; char *lcs; l1 = strlen(s1); l2 = strlen(s2); L = (int **)calloc(l1+1, sizeof(int *)); B = (int **)calloc(l1+1, sizeof(int *)); if (!L) { perror("calloc: "); exit(1); } if (!B) { perror("calloc: "); exit(1); } for (j = 0; j <= l1; j++) { L[j] = (int *)calloc(l2+1, sizeof(int)); if (!L[j]) { perror("calloc: "); exit(1); } B[j] = (int *)calloc(l2+1, sizeof(int)); if (!L[j]) { perror("calloc: "); exit(1); } } lcslen(s1, s2, l1, l2, L, B); lcs = lcsbuild(s1, l1, l2, L, B); assert(L[l1][l2] == 27); assert(strcmp(lcs, "CGTTCGGCTATGCTTCTACTTATTCTA") == 0); printf("S1: %s\tS2: %s\n", s1, s2); printf("LCS len:%3d\n", L[l1][l2]); printf("LCS: %s\n", lcs); free(lcs); for (j = 0; j <= l1; j++) free(L[j]), free(B[j]); free(L); free(B); printf("All tests have successfully passed!\n"); } /** * @brief Main function * @param argc commandline argument count (ignored) * @param argv commandline array of arguments (ignored) * @returns 0 on exit */ int main(int argc, char *argv[]) { test(); // run self-test implementations return 0; }