TheAlgorithms-C/misc/mcnaughton_yamada_thompson.c
Sharon "Cass" Cassidy e1fcdd2a04
feat: Add McNaughton–Yamada–Thompson algorithm (#1241)
* updating DIRECTORY.md

* Create mcnaughton_yamada_thompson.c

* updating DIRECTORY.md

* Update mcnaughton_yamada_thompson.c

* fix some memory leaks

* fix another memory leak

* Update mcnaughton_yamada_thompson.c

* added more test cases

* a few formatting changes

* another few SPaG changes

* Update misc/mcnaughton_yamada_thompson.c

Co-authored-by: David Leal <halfpacho@gmail.com>

* updating DIRECTORY.md

---------

Co-authored-by: github-actions[bot] <github-actions@users.noreply.github.com>
Co-authored-by: David Leal <halfpacho@gmail.com>
2023-04-11 18:57:27 -06:00

722 lines
21 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* @file
* @brief [McNaughtonYamadaThompson algorithm](https://en.wikipedia.org/wiki/Thompson%27s_construction)
* @details
* From Wikipedia:
* In computer science, Thompson's construction algorithm,
* also called the McNaughtonYamadaThompson algorithm,
* is a method of transforming a regular expression into
* an equivalent nondeterministic finite automaton (NFA).
* This implementation implements the all three operations
* (implicit concatenation, '|' for union, '*' for Kleene star)
* required by the formal definition of regular expressions.
* @author [Sharon Cassidy](https://github.com/CascadingCascade)
*/
#include <assert.h> /// for assert()
#include <stdio.h> /// for IO operations
#include <string.h> /// for string operations
#include <stdlib.h> /// for memory management
/* Begin declarations, I opted to place various helper / utility functions
* close to their usages and didn't split their declaration / definition */
/**
* @brief Definition for a binary abstract syntax tree (AST) node
*/
struct ASTNode {
char content; ///< the content of this node
struct ASTNode* left; ///< left child
struct ASTNode* right; ///< right child
};
struct ASTNode* createNode(char content);
void destroyNode(struct ASTNode* node);
char* preProcessing(const char* input);
struct ASTNode* buildAST(const char* input);
/**
* @brief Definition for a NFA state transition rule
*/
struct transRule {
struct NFAState* target; ///< pointer to the state to transit to
char cond; ///< the input required to activate this transition
};
struct transRule* createRule(struct NFAState* state, char c);
void destroyRule(struct transRule* rule);
/**
* @brief Definition for a NFA state. Each NFAState object is initialized
* to have a capacity of three rules, since there will only be at most two
* outgoing rules and one empty character circular rule in this algorithm
*/
struct NFAState {
int ruleCount; ///< number of transition rules this state have
struct transRule** rules; ///< the transition rules
};
struct NFAState* createState(void);
void destroyState(struct NFAState* state);
/**
* @brief Definition for the NFA itself.
* statePool[0] is defined to be its starting state,
* and statePool[1] is defined to be its accepting state.
* for simplicity's sake all NFAs are initialized to have
* a small fixed capacity, although due to the recursive nature
* of this algorithm this capacity is believed to be sufficient
*/
struct NFA {
int stateCount; ///< the total number of states this NFA have
struct NFAState** statePool; ///< the pool of all available states
int ruleCount; ///< the total number of transition rules in this NFA
struct transRule** rulePool; ///< the pool of all transition rules
int CSCount; ///< the number of currently active states
struct NFAState** currentStates; ///< the pool of all active states
int subCount; ///< the number of sub NFAs
struct NFA** subs; ///< the pool of all sub NFAs
int wrapperFlag; ///< whether this NFA is a concatenation wrapper
};
struct NFA* createNFA(void);
void destroyNFA(struct NFA* nfa);
void addState(struct NFA* nfa, struct NFAState* state);
void addRule(struct NFA* nfa, struct transRule* rule, int loc);
void postProcessing(struct NFA* nfa);
void transit(struct NFA* nfa, char input);
int isAccepting(const struct NFA* nfa);
/* End definitions, begin abstract syntax tree construction */
/**
* @brief helper function to determine whether a character should be
* considered a character literal
* @param ch the character to be tested
* @returns `1` if it is a character literal
* @returns `0` otherwise
*/
int isLiteral(const char ch) {
return !(ch == '(' || ch == ')' || ch == '*' || ch == '\n' || ch == '|');
}
/**
* @brief performs preprocessing on a regex string,
* making all implicit concatenations explicit
* @param input target regex string
* @returns pointer to the processing result
*/
char* preProcessing(const char* input) {
const size_t len = strlen(input);
if(len == 0) {
char* str = malloc(1);
str[0] = '\0';
return str;
}
char* str = malloc(len * 2);
size_t op = 0;
for (size_t i = 0; i < len - 1; ++i) {
char c = input[i];
str[op++] = c;
// one character lookahead
char c1 = input[i + 1];
if( (isLiteral(c) && isLiteral(c1)) ||
(isLiteral(c) && c1 == '(') ||
(c == ')' && c1 == '(') ||
(c == ')' && isLiteral(c1)) ||
(c == '*' && isLiteral(c1)) ||
(c == '*' && c1 == '(')
) {
// '\n' is used to represent concatenation
// in this implementation
str[op++] = '\n';
}
}
str[op++] = input[len - 1];
str[op] = '\0';
return str;
}
/**
* @brief utility function to locate the first occurrence
* of a character in a string while respecting parentheses
* @param str target string
* @param key the character to be located
* @returns the index of its first occurrence, `0` if it could not be found
*/
size_t indexOf(const char* str, char key) {
int depth = 0;
for (size_t i = 0; i < strlen(str); ++i) {
const char c = str[i];
if(depth == 0 && c == key) {
return i;
}
if(c == '(') depth++;
if(c == ')') depth--;
}
// Due to the way this function is intended to be used,
// it's safe to assume the character will not appear as
// the string's first character
// thus `0` is used as the `not found` value
return 0;
}
/**
* @brief utility function to create a subString
* @param str target string
* @param begin starting index, inclusive
* @param end ending index, inclusive
* @returns pointer to the newly created subString
*/
char* subString(const char* str, size_t begin, size_t end) {
char* res = malloc(end - begin + 2);
strncpy(res, str + begin, end - begin + 1);
res[end - begin + 1] = '\0';
return res;
}
/**
* @brief recursively constructs a AST from a preprocessed regex string
* @param input regex
* @returns pointer to the resulting tree
*/
struct ASTNode* buildAST(const char* input) {
struct ASTNode* node = createNode('\0');
node->left = NULL;
node->right = NULL;
const size_t len = strlen(input);
size_t index;
// Empty input
if(len == 0) return node;
// Character literals
if(len == 1) {
node->content = input[0];
return node;
}
// Discard parentheses
if(input[0] == '(' && input[len - 1] == ')') {
char* temp = subString(input, 1, len - 2);
destroyNode(node);
node = buildAST(temp);
free(temp);
return node;
}
// Union
index = indexOf(input, '|');
if(index) {
node->content = '|';
char* temp1 = subString(input, 0, index - 1);
char* temp2 = subString(input, index + 1, len - 1);
node->left = buildAST(temp1);
node->right = buildAST(temp2);
free(temp2);
free(temp1);
return node;
}
// Concatenation
index = indexOf(input, '\n');
if(index) {
node->content = '\n';
char* temp1 = subString(input, 0, index - 1);
char* temp2 = subString(input, index + 1, len - 1);
node->left = buildAST(temp1);
node->right = buildAST(temp2);
free(temp2);
free(temp1);
return node;
}
// Kleene star
// Testing with indexOf() is unnecessary here,
// Since all other possibilities have been exhausted
node->content = '*';
char* temp = subString(input, 0, len - 2);
node->left = buildAST(temp);
node->right = NULL;
free(temp);
return node;
}
/* End AST construction, begins the actual algorithm itself */
/**
* @brief helper function to recursively redirect transition rule targets
* @param nfa target NFA
* @param src the state to redirect away from
* @param dest the state to redirect to
* @returns void
*/
void redirect(struct NFA* nfa, struct NFAState* src, struct NFAState* dest) {
for (int i = 0; i < nfa->subCount; ++i) {
redirect(nfa->subs[i], src, dest);
}
for (int i = 0; i < nfa->ruleCount; ++i) {
struct transRule* rule = nfa->rulePool[i];
if (rule->target == src) {
rule->target = dest;
}
}
}
struct NFA* compileFromAST(struct ASTNode* root) {
struct NFA* nfa = createNFA();
// Empty input
if (root->content == '\0') {
addRule(nfa, createRule(nfa->statePool[1], '\0'), 0);
return nfa;
}
// Character literals
if (isLiteral(root->content)) {
addRule(nfa, createRule(nfa->statePool[1], root->content), 0);
return nfa;
}
switch (root->content) {
case '\n': {
struct NFA* ln = compileFromAST(root->left);
struct NFA* rn = compileFromAST(root->right);
// Redirects all rules targeting ln's accepting state to
// target rn's starting state
redirect(ln, ln->statePool[1], rn->statePool[0]);
// Manually creates and initializes a special
// "wrapper" NFA
destroyNFA(nfa);
struct NFA* wrapper = malloc(sizeof(struct NFA));
wrapper->stateCount = 2;
wrapper->statePool = malloc(sizeof(struct NFAState*) * 2);
wrapper->subCount = 0;
wrapper->subs = malloc(sizeof(struct NFA*) * 2);
wrapper->ruleCount = 0;
wrapper->rulePool = malloc(sizeof(struct transRule*) * 3);
wrapper->CSCount = 0;
wrapper->currentStates = malloc(sizeof(struct NFAState*) * 2);
wrapper->wrapperFlag = 1;
wrapper->subs[wrapper->subCount++] = ln;
wrapper->subs[wrapper->subCount++] = rn;
// Maps the wrapper NFA's starting and ending states
// to its sub NFAs
wrapper->statePool[0] = ln->statePool[0];
wrapper->statePool[1] = rn->statePool[1];
return wrapper;
}
case '|': {
struct NFA* ln = compileFromAST(root->left);
struct NFA* rn = compileFromAST(root->right);
nfa->subs[nfa->subCount++] = ln;
nfa->subs[nfa->subCount++] = rn;
// Adds empty character transition rules
addRule(nfa, createRule(ln->statePool[0], '\0'), 0);
addRule(ln, createRule(nfa->statePool[1], '\0'), 1);
addRule(nfa, createRule(rn->statePool[0], '\0'), 0);
addRule(rn, createRule(nfa->statePool[1], '\0'), 1);
return nfa;
}
case '*': {
struct NFA* ln = compileFromAST(root->left);
nfa->subs[nfa->subCount++] = ln;
addRule(ln, createRule(ln->statePool[0], '\0'), 1);
addRule(nfa, createRule(ln->statePool[0], '\0'), 0);
addRule(ln, createRule(nfa->statePool[1], '\0'), 1);
addRule(nfa, createRule(nfa->statePool[1], '\0'), 0);
return nfa;
}
}
// Fallback, shouldn't happen in normal operation
destroyNFA(nfa);
return NULL;
}
/* Ends the algorithm, begins NFA utility functions*/
/**
* @brief adds a state to a NFA
* @param nfa target NFA
* @param state the NFA state to be added
* @returns void
*/
void addState(struct NFA* nfa, struct NFAState* state) {
nfa->statePool[nfa->stateCount++] = state;
}
/**
* @brief adds a transition rule to a NFA
* @param nfa target NFA
* @param rule the rule to be added
* @param loc which state this rule should be added to
* @returns void
*/
void addRule(struct NFA* nfa, struct transRule* rule, int loc) {
nfa->rulePool[nfa->ruleCount++] = rule;
struct NFAState* state = nfa->statePool[loc];
state->rules[state->ruleCount++] = rule;
}
/**
* @brief performs postprocessing on a compiled NFA,
* add circular empty character transition rules where
* it's needed for the NFA to function correctly
* @param nfa target NFA
* @returns void
*/
void postProcessing(struct NFA* nfa) {
// Since the sub NFA's states and rules are managed
// through their own pools, recursion is necessary
for (int i = 0; i < nfa->subCount; ++i) {
postProcessing(nfa->subs[i]);
}
// If a state does not have any empty character accepting rule,
// we add a rule that circles back to itself
// So this state will be preserved when
// empty characters are inputted
for (int i = 0; i < nfa->stateCount; ++i) {
struct NFAState* pState = nfa->statePool[i];
int f = 0;
for (int j = 0; j < pState->ruleCount; ++j) {
if(pState->rules[j]->cond == '\0') {
f = 1;
break;
}
}
if (!f) {
addRule(nfa, createRule(pState, '\0'), i);
}
}
}
/**
* @brief helper function to determine an element's presence in an array
* @param states target array
* @param len length of the target array
* @param state the element to search for
* @returns `1` if the element is present, `0` otherwise
*/
int contains(struct NFAState** states, int len, struct NFAState* state) {
int f = 0;
for (int i = 0; i < len; ++i) {
if(states[i] == state) {
f = 1;
break;
}
}
return f;
}
/**
* @brief helper function to manage empty character transitions
* @param target target NFA
* @param states pointer to results storage location
* @param sc pointer to results count storage location
* @returns void
*/
void findEmpty(struct NFAState* target, struct NFAState** states, int *sc) {
for (int i = 0; i < target->ruleCount; ++i) {
const struct transRule *pRule = target->rules[i];
if (pRule->cond == '\0' && !contains(states, *sc, pRule->target)) {
states[(*sc)++] = pRule->target;
// the use of `states` and `sc` is necessary
// to sync data across recursion levels
findEmpty(pRule->target, states, sc);
}
}
}
/**
* @brief moves a NFA forward
* @param nfa target NFA
* @param input the character to be fed into the NFA
* @returns void
*/
void transit(struct NFA* nfa, char input) {
struct NFAState** newStates = malloc(sizeof(struct NFAState*) * 10);
int NSCount = 0;
if (input == '\0') {
// In case of empty character input, it's possible for
// a state to transit to another state that's more than
// one rule away, we need to take that into account
for (int i = nfa->CSCount - 1; i > -1; --i) {
struct NFAState *pState = nfa->currentStates[i];
nfa->CSCount--;
struct NFAState** states = malloc(sizeof(struct NFAState*) * 10);
int sc = 0;
findEmpty(pState, states, &sc);
for (int j = 0; j < sc; ++j) {
if(!contains(newStates,NSCount, states[j])) {
newStates[NSCount++] = states[j];
}
}
free(states);
}
} else {
// Iterates through all current states
for (int i = nfa->CSCount - 1; i > -1; --i) {
struct NFAState *pState = nfa->currentStates[i];
// Gradually empties the current states pool, so
// it can be refilled
nfa->CSCount--;
// Iterates through rules of this state
for (int j = 0; j < pState->ruleCount; ++j) {
const struct transRule *pRule = pState->rules[j];
if(pRule->cond == input) {
if(!contains(newStates, NSCount, pRule->target)) {
newStates[NSCount++] = pRule->target;
}
}
}
}
}
nfa->CSCount = NSCount;
for (int i = 0; i < NSCount; ++i) {
nfa->currentStates[i] = newStates[i];
}
free(newStates);
}
/**
* @brief determines whether the NFA is currently in its accepting state
* @param nfa target NFA
* @returns `1` if the NFA is in its accepting state
* @returns `0` otherwise
*/
int isAccepting(const struct NFA* nfa) {
for (int i = 0; i < nfa->CSCount; ++i) {
if(nfa->currentStates[i] == nfa->statePool[1]) {
return 1;
}
}
return 0;
}
/* Ends NFA utilities, begins testing function*/
/**
* @brief Testing helper function
* @param regex the regular expression to be used
* @param string the string to match against
* @param expected expected results
* @returns void
*/
void testHelper(const char* regex, const char* string, const int expected) {
char* temp = preProcessing(regex);
struct ASTNode* node = buildAST(temp);
struct NFA* nfa = compileFromAST(node);
postProcessing(nfa);
// reallocates the outermost NFA's current states pool
// because it will actually be used to store all the states
nfa->currentStates = realloc(nfa->currentStates, sizeof(struct NFAState*) * 100);
// Starts the NFA by adding its starting state to the pool
nfa->currentStates[nfa->CSCount++] = nfa->statePool[0];
// feeds empty characters into the NFA before and after
// every normal character
for (size_t i = 0; i < strlen(string); ++i) {
transit(nfa, '\0');
transit(nfa, string[i]);
}
transit(nfa, '\0');
assert(isAccepting(nfa) == expected);
destroyNFA(nfa);
destroyNode(node);
free(temp);
}
/**
* @brief Self-test implementations
* @returns void
*/
static void test(void) {
testHelper("(c|a*b)", "c", 1);
testHelper("(c|a*b)", "aab", 1);
testHelper("(c|a*b)", "ca", 0);
testHelper("(c|a*b)*", "caaab", 1);
testHelper("(c|a*b)*", "caba", 0);
testHelper("", "", 1);
testHelper("", "1", 0);
testHelper("(0|(1(01*(00)*0)*1)*)*","11",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","110",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","1100",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","10000",0);
testHelper("(0|(1(01*(00)*0)*1)*)*","00000",1);
printf("All tests have successfully passed!\n");
}
/**
* @brief Main function
* @returns 0 on exit
*/
int main(void) {
test(); // run self-test implementations
return 0;
}
/* I opted to place these more-or-less boilerplate code and their docs
* at the end of file for better readability */
/**
* @brief creates and initializes a AST node
* @param content data to initializes the node with
* @returns pointer to the newly created node
*/
struct ASTNode* createNode(const char content) {
struct ASTNode* node = malloc(sizeof(struct ASTNode));
node->content = content;
node->left = NULL;
node->right = NULL;
return node;
}
/**
* @brief recursively destroys a AST
* @param node the root node of the tree to be deleted
* @returns void
*/
void destroyNode(struct ASTNode* node) {
if(node->left != NULL) {
destroyNode(node->left);
}
if(node->right != NULL) {
destroyNode(node->right);
}
free(node);
}
/**
* @brief creates and initializes a transition rule
* @param state transition target
* @param c transition condition
* @returns pointer to the newly created rule
*/
struct transRule* createRule(struct NFAState* state, char c) {
struct transRule* rule = malloc(sizeof(struct transRule));
rule->target = state;
rule->cond = c;
return rule;
}
/**
* @brief destroys a transition rule object
* @param rule pointer to the object to be deleted
* @returns void
*/
void destroyRule(struct transRule* rule) {
free(rule);
}
/**
* @brief creates and initializes a NFA state
* @returns pointer to the newly created NFA state
*/
struct NFAState* createState(void) {
struct NFAState* state = malloc(sizeof(struct NFAState));
state->ruleCount = 0;
state->rules = malloc(sizeof(struct transRule*) * 3);
return state;
}
/**
* @brief destroys a NFA state
* @param state pointer to the object to be deleted
* @returns void
*/
void destroyState(struct NFAState* state) {
free(state->rules);
free(state);
}
/**
* @brief creates and initializes a NFA
* @returns pointer to the newly created NFA
*/
struct NFA* createNFA(void) {
struct NFA* nfa = malloc(sizeof(struct NFA));
nfa->stateCount = 0;
nfa->statePool = malloc(sizeof(struct NFAState*) * 5);
nfa->ruleCount = 0;
nfa->rulePool = malloc(sizeof(struct transRule*) * 10);
nfa->CSCount = 0;
nfa->currentStates = malloc(sizeof(struct NFAState*) * 5);
nfa->subCount = 0;
nfa->subs = malloc(sizeof(struct NFA*) * 5);
nfa->wrapperFlag = 0;
addState(nfa, createState());
addState(nfa, createState());
return nfa;
}
/**
* @brief recursively destroys a NFA
* @param nfa pointer to the object to be deleted
* @returns void
*/
void destroyNFA(struct NFA* nfa) {
for (int i = 0; i < nfa->subCount; ++i) {
destroyNFA(nfa->subs[i]);
}
// In case of a wrapper NFA, do not free its states
// because it doesn't really have any states of its own
if (!nfa->wrapperFlag) {
for (int i = 0; i < nfa->stateCount; ++i) {
destroyState(nfa->statePool[i]);
}
}
for (int i = 0; i < nfa->ruleCount; ++i) {
destroyRule(nfa->rulePool[i]);
}
free(nfa->statePool);
free(nfa->currentStates);
free(nfa->rulePool);
free(nfa->subs);
free(nfa);
}