Aho-Corasick algorithm implemented in C

Question 1

I've been reimplementing the fgrep utility in C for fun & study, and so had to implement the Aho-Corasick algorithm. I'm trying to improve my C skills in general, so any and all feedback would be greatly appreciated.

The project is compiled using gcc and gnu17.

Thanks in advance :]

aho_corasick.h/c

/*
 * FILE: aho_corasick.h
 * ABOUT: Definitions for the aho-corasick string matching algorithm.
 * See https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
 */
#ifndef AHO_CORASIC_H
#define AHO_CORASIC_H
#include <stdlib.h>
#include <sys/queue.h>
struct ac_node;
typedef struct ac_node ac_node_t;
typedef struct ac_automaton {
 ac_node_t *root;
} ac_automaton_t;
/**
 * Creates a new automaton used in string matching.
 * @param words Dictionary of words to be matched against.
 * @param wc Number of words in words.
 * @return An automaton build from the dictionary.
 */
ac_automaton_t new_automaton(const char **words, size_t wc);
/**
 * Frees the memory allocated by the automaton.
 * @param automaton Automaton to be deleted.
 */
void delete_automaton(ac_automaton_t automaton);
typedef struct match
{
 size_t start, length;
 LIST_ENTRY(match) entries;
} match_t;
typedef LIST_HEAD(match_list_head, match) match_list_head_t;
/**
 * Frees the memory allocated by the match-list.
 * @param head Head of the match-list to be deleted.
 */
void delete_match_list(match_list_head_t *head);
/**
 * Finds all substring matching an of the automatons dictionary.
 * @param str String to be searched.
 * @param automaton Automaton used in matching.
 * @return A list of matches in order of appearence in str.
 */
match_list_head_t *match(const char *str, const ac_automaton_t automaton);
#endif

/*
 * FILE: aho_corasick.c
 * ABOUT: Implementation of the aho-corasick string matching algorithm defined in aho_corasick.h.
 * See https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
 */
#include <stdlib.h>
#include <sys/queue.h>
#include <stdbool.h>
#include "aho_corasick.h"
#define ASCII_CHAR_COUNT 256 // No. of valid (extended) ASCII characters
struct ac_node
{
 char val;
 bool is_final;
 size_t length;
 struct ac_node *parent;
 struct ac_node *children[ASCII_CHAR_COUNT];
 struct ac_node *suffix_link, *dict_suffix_link;
};
ac_node_t *new_node(ac_node_t *parent, char val)
{
 ac_node_t *node = calloc(1, sizeof(ac_node_t));
 node->parent = parent;
 node->val = val;
 node->length = parent == NULL
 ? 0
 : parent->length + 1;
 
 return node;
}
typedef struct node_q_entry
{
 ac_node_t *node;
 STAILQ_ENTRY(node_q_entry) entries;
} node_q_entry_t;
typedef STAILQ_HEAD(node_q_head, node_q_entry) node_q_head_t;
static void enqueue(node_q_head_t *head, ac_node_t *node)
{
 node_q_entry_t *entry = malloc(sizeof(node_q_entry_t));
 entry->node = node;
 STAILQ_INSERT_TAIL(head, entry, entries);
}
static ac_node_t *dequeue(node_q_head_t *head)
{
 node_q_entry_t *entry = STAILQ_FIRST(head);
 STAILQ_REMOVE_HEAD(head, entries);
 ac_node_t *node = entry->node;
 free(entry);
 return node;
}
static void insert_word(ac_node_t *root, const char *word)
{
 ac_node_t *node = root;
 while (*word != '0円')
 {
 int c = *word;
 if (node->children[c] == NULL)
 node->children[c] = new_node(node, c);
 node = node->children[c];
 ++word;
 }
 node->is_final = true;
}
static bool is_root(const ac_node_t *node)
{
 return node->parent == NULL;
}
static ac_node_t *find_suffix_link(ac_node_t *node)
{
 if (is_root(node)) return NULL;
 if (is_root(node->parent)) return node->parent;
 int val = node->val;
 node = node->parent->suffix_link;
 while (node->children[val] == NULL)
 {
 if (is_root(node)) return node;
 node = node->parent;
 }
 return node->children[val];
}
static ac_node_t *find_dict_suffix_link(ac_node_t *node)
{
 node = node->suffix_link;
 while (node != NULL)
 {
 if (node->is_final) return node;
 node = node->suffix_link;
 }
 
 return NULL;
}
ac_automaton_t new_automaton(const char **words, size_t wc)
{
 ac_node_t *root = new_node(NULL, '0円');
 // Insert words
 for (size_t i = 0; i < wc; ++i)
 {
 const char *w = words[i];
 insert_word(root, w);
 }
 
 // Add (dict) suffix links using breadth-first search
 node_q_head_t head;
 STAILQ_INIT(&head);
 enqueue(&head, root);
 while (!STAILQ_EMPTY(&head))
 {
 ac_node_t *node = dequeue(&head);
 node->suffix_link = find_suffix_link(node);
 node->dict_suffix_link = find_dict_suffix_link(node);
 for (int i = 0; i < ASCII_CHAR_COUNT; ++i)
 {
 ac_node_t *child = node->children[i];
 if (child != NULL) enqueue(&head, child);
 } 
 }
 return (ac_automaton_t) { .root = root };
}
void delete_automaton(ac_automaton_t automaton)
{
 node_q_head_t head;
 STAILQ_INIT(&head);
 enqueue(&head, automaton.root);
 while (!STAILQ_EMPTY(&head))
 {
 ac_node_t *node = dequeue(&head);
 for (int i = 0; i < ASCII_CHAR_COUNT; ++i)
 {
 ac_node_t *child = node->children[i];
 if (child != NULL) enqueue(&head, child);
 } 
 free(node);
 }
}
void delete_match_list(match_list_head_t *head)
{
 match_t *m1 = LIST_FIRST(head);
 while (m1 != NULL) {
 match_t *m2 = LIST_NEXT(m1, entries);
 free(m1);
 m1 = m2;
 }
 LIST_INIT(head);
}
static void add_match(match_list_head_t *head, size_t pos, size_t length)
{
 static match_t *last_match = NULL;
 match_t *m = malloc(sizeof(match_t));
 m->start = pos - length + 1;
 m->length = length;
 if (last_match == NULL)
 LIST_INSERT_HEAD(head, m, entries);
 else
 LIST_INSERT_AFTER(last_match, m, entries);
 last_match = m;
}
match_list_head_t *match(const char *str, const ac_automaton_t automaton)
{
 match_list_head_t *head = malloc(sizeof(match_list_head_t));
 LIST_INIT(head);
 ac_node_t *node = automaton.root;
 size_t pos = 0;
 for (int c = *str; c != '0円'; c = *(++str))
 {
 // Follow children & suffix links
 while (node->children[c] == NULL)
 {
 node = node->suffix_link;
 if (node == NULL) break;
 }
 node = node == NULL
 ? automaton.root
 : node->children[c];
 // Look for matches
 if (node->is_final)
 add_match(head, pos, node->length);
 ac_node_t *dict_node = node->dict_suffix_link;
 while (dict_node != NULL)
 {
 add_match(head, pos, dict_node->length);
 dict_node = dict_node->dict_suffix_link;
 }
 ++pos;
 }
 
 return head;
}

test.c

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/queue.h>
#include "src/aho_corasick.h"
int main(void)
{
 // wikipedia examples
 const char* wiki_words[] = {
 "a",
 "ab",
 "bab",
 "bc",
 "bca",
 "c",
 "caa",
 };
 size_t wiki_wc = sizeof(wiki_words)/sizeof(wiki_words[0]);
 ac_automaton_t wiki_automaton = new_automaton(wiki_words, wiki_wc);
 match_list_head_t *head = match("abccab", wiki_automaton);
 match_t *m = LIST_FIRST(head);
 assert(m->start == 0);
 assert(m->length == 1);
 m = LIST_NEXT(m, entries);
 assert(m->start == 0);
 assert(m->length == 2);
 
 m = LIST_NEXT(m, entries);
 assert(m->start == 1);
 assert(m->length == 2);
 
 m = LIST_NEXT(m, entries);
 assert(m->start == 2);
 assert(m->length == 1);
 
 m = LIST_NEXT(m, entries);
 assert(m->start == 3);
 assert(m->length == 1);
 
 m = LIST_NEXT(m, entries);
 assert(m->start == 4);
 assert(m->length == 1);
 
 m = LIST_NEXT(m, entries);
 assert(m->start == 4);
 assert(m->length == 2);
 m = LIST_NEXT(m, entries);
 assert(m == NULL);
 
 delete_match_list(head);
 assert(LIST_EMPTY(head));
 delete_automaton(wiki_automaton);
 puts("Tests ran succesfully!");
 return EXIT_SUCCESS;
}

Makefile

SRC_DIR := ./src
OBJ_DIR := ./obj
BIN_DIR := ./bin
CC := gcc
CFLAGS := -Wall -ggdb
$(OBJ_DIR)/aho_corasick.o: $(SRC_DIR)/aho_corasick.h $(SRC_DIR)/aho_corasick.c
 @mkdir -p $(OBJ_DIR)
 $(CC) -c -o $(OBJ_DIR)/aho_corasick.o $(SRC_DIR)/aho_corasick.c $(CFLAGS)
.PHONY: test clean
test: $(OBJ_DIR)/aho_corasick.o
 @mkdir -p $(BIN_DIR)
 $(CC) -o $(BIN_DIR)/test test.c $(OBJ_DIR)/* $(CFLAGS)
 $(BIN_DIR)/test
clean:
 rm -rf $(BIN_DIR) $(OBJ_DIR)

Project Layout

.
├── Makefile
├── src
│  ├── aho_corasick.c
│  └── aho_corasick.h
└── test.c

Question 2

Looks nice to me. I don't see you handling calloc()/malloc() failure properly. Review later. Kindly specify whether you're using C99, C11, C17, or C23.

Question 3

You're using true without including <stdbool.h>. Are you perchance not compiling with warnings enabled, or compiling with std=c2x/c23/gnu2x/gnu23, or ignoring the warnings? Can you include the contents of the Makefile?

Question 4

Hi, I've specified the compiler & standard in the question.

Question 5

ISO C11 doesn't have sys/queue.h albeit, and doesn't know about true, false, and bool unless you incude <stdbool.h>. The Linux man page states that sys/queue.h is part of the BSD Standard, so not even POSIX.

Question 6

@Harith Thanks, I've added the Makefile and fixed the mistake (I'm using gcc 11.3.0 which should default to gnu17, sorry for the confusion 🤦). stdbool should have been imported at the beginning of aho_corasick.c.

Question 7

references

Woo hoo! You cite a reference for the underlying algorithm. Excellent.

In the header comments for match(), nit: Minor grammar / typos.

includes

#include <stdlib.h>
#include <sys/queue.h>
#include <stdbool.h>
#include "aho_corasick.h"

IDK, maybe elide the first two? Given that your header will pull them in? Or maybe there's some common wisdom about this sort of thing, like keeping the first two makes certain static analysis easier, so I should just stay mum.

EDIT: @Harith offers this sage advice about being robust to unrelated file edits:

the source files should not depend on their corresponding header files to include their dependencies, neither should the header file include what it doesn't require.

Also, thank you for being explicit about the "no unicode!" assumption, whew!, what a relief.

OIC, the ASCII_CHAR_COUNT definition makes it very easy to e.g. impose a 7-bit US-ASCII restriction on inputs, immediately cutting memory use in approximately half. Nice.

negative char

struct ac_node
{
 char val;

Hmmm, signed char, this will be exciting. I'll go get some popcorn.

Ok, I'm back, read it to the end. Looks like uchar would work here. Certainly that's what we intend when indexing those arrays. Hmmm, actually we promote to signed int before indexing. Interesting. I think you intend to operate over e.g. 8-bit clean Latin1 text, with no negatives.

Thank you for those nice size_t declarations we see all over the place, they look great.

 struct ac_node *suffix_link, *dict_suffix_link;

And thank you for those names. Following the terminology of the cited reference makes it much easier to trace the correspondence between it and the implementation.

success check

ac_node_t *new_node(ac_node_t *parent, char val)
{
 ac_node_t *node = calloc(1, sizeof(ac_node_t));
 node->parent = parent;

Sigh, UB! You can save it with a safe_calloc macro definition which tests for NULL and exits the process upon failure.

Similarly for the enqueue and add_match calls to malloc().

braces

There's a giant literature on how

 if (c)
 single_statement();

leads to maintenance trouble down the road. Consider adopting a linter / pretty-printer / style guide which arranges for { } braces even in the single-statement case.

Even if it's fine in your code, you may find it's a good habit for when you collaborate with others.

(And now I see why macros such as LIST_INSERT_HEAD need to have do ... while(0) on the inside.)

test suite

The unit test is very nice, and self-evaluating so it would report failure. But there's only one. I was kind of hoping to see some giant examples and maybe some benchmark timings, or some counter instrumentation. I mean, that was a lot of work, so now it's time to show off, right?

Here's one to consider. Grab a copy of /usr/share/dict/words as the haystack and cycle through a couple of word sets to grep from it. Choose words long enough to have only one or a handful of matches. Maybe one set has 2 words and the other has 20. Search for matches, reporting elapsed time and perhaps counter values. Since the size of the words file dominates, ideally we see very similar elapsed times across the runs.

memory footprint

Each ac_node is on the large side, containing 256 64-bit pointers. We have sparsity; the vast majority of those pointers will be NULL. Certainly indexing the children array is very fast.

Consider linking against a library that supplies a hash table, and turning index operations into hashmap operations. This may make the program more cache friendly, and hence faster. Benchmark and see!

This codebase achieves its design goals.

I would be willing to delegate or accept maintenance tasks on it.

Question 8

About the hash table: one could use the POSIX hcreate()/hsearch()/hdestroy() functions.

Question 9

To add to @J_H's good points:

Only include what's required:

In "aho_corasick.h", <stddef.h> should suffice for the definition of size_t. You do not need to include <stdlib.h> for it.

Use `static` keyword for array parameters:

In C99 and above, you can do array[static size] in function parameters to specify that array must have at least size items.

A compiler has then the right to assume that argument is not nullptr and therefore it could perform some optimizations.

Moreover, this also documents the function better. As the array must have at least size items, it also means that a nullptr is not a valid value for this argument.

It also eliminates the need of having to document that the function would invoke undefined behavior if nullptrs are passed in, or if the function was testing if any parameter is nullptr, it eliminates onw or more branches.

After modifying one function prototype, this is how it looks:

// ac_automaton_t new_automaton(const char **words, size_t wc);
ac_automaton_t new_automaton(size_t wc, const char *words[static wc]);

Now for these calls:

new_automaton(10, nullptr);
static const char *const words[5] = {};
new_automaton(10, words);
char *p = nullptr;
new_automaton(102, p);

GCC 14.1 detects all 3 wrong use cases and raises diagnostic information, whilst Clang 18.1.0 and x86-64 icx 202400 only detect the first one. A win for GCC.

MSVC does not support this C99 feature (who is suprised?), but MinGW gcc 13.1.0 and MinGW clang 16.0.2 have the same output as GCC and Clang.

Note that this feature is not usable for return values, local variables, or for functions like memcpy() because arrays of void are illegal. A GCC extension, __attribute__((nonnull)) allows parameters to be marked as nonnull. This extension is also supported by Intel's compiler and Clang. A Clang extension allows _Nullable, _Nonnull and _Null_unspecified. This extension is not supported by GCC. A GCC extension, __attribute__((returns_nonnull)) can be used to specify that a function returns a nonnull value. This extension is also supported by Clang and Intel's compiler. I await an _Optional type in the next revision of the C Standard. There are already some proposals for it.

Aside: In C23, you can replace the ASCII_CHAR_COUNT macro with:

static constexpr unsigned int ASCII_CHAR_COUNT = 256;
struct ac_node *children[ASCII_CHAR_COUNT];

Potential Undefined Behavior:

ac_node_t *node = calloc(1, sizeof(ac_node_t));
node->parent = parent;

There's no check for calloc() failure. malloc() and family returns a nullptr on failure. Failing to check for it risks invoking undefined behavior by a subsequent nullptr dereference.

Change it to:

ac_node_t *node = calloc(1, sizeof *node);
if (!node) { // Or, node == nullptr or node == NULL
 // Complain 
 TDB_code();
}

And similarly for the other calls to malloc() in the other functions.

Note that I have changed sizeof (ac_node_t) to sizeof *node. In the case that node was changed to some other type, you would not require a similar change in the calloc() call. This eases maintainence.

Portability:

From the Linux man page:

HISTORY top
 <sys/queue.h> macros first appeared in 4.4BSD.
NOTES top
 Some BSDs provide SIMPLEQ instead of STAILQ. They are identical,
 but for historical reasons they were named differently on
 different BSDs. STAILQ originated on FreeBSD, and SIMPLEQ
 originated on NetBSD. For compatibility reasons, some systems
 provide both sets of macros. glibc provides both STAILQ and
 SIMPLEQ, which are identical except for a missing SIMPLEQ
 equivalent to STAILQ_CONCAT().

It should be enough to define _BSD_SOURCE for sys/queue.h. Compiling with -std=gnu17 should not be necessary
Some compile-time detection is required to use the proper macros provided by an implementation. As the man page says, some BSDs provide SIMPLEQ instead of STAILQ.

To add to @J_H's "braces" point, see: Apple's SSL/TLS bug.

 // Insert words
 for (size_t i = 0; i < wc; ++i)
 {
 const char *w = words[i];
 insert_word(root, w);
 }

And do not comment the obvious. Though I don't see why you need the extra variable w, insert_word(root, words[i]) should suffice. And if you go to Reddit, @skeeto might see your post and teach you a thing or two about fuzz testing with AFL++ (the author of this review is not familiar with the API yet).

I'd also merge these two assertions:

match_t *m = LIST_FIRST(head);
assert(m->start == 0);
assert(m->length == 1);

into one.

When you add "benchmark timings, or some counter instrumentation", like @J_H suggested, compile with -O2/-O3.

I'd also add .DELETE_ON_ERROR to the Makefile. According to GNU make's documentation:

If .DELETE_ON_ERROR is mentioned as a target anywhere in the makefile, then make will delete the target of a rule if it has changed and its recipe exits with a nonzero exit status, just as it does when it receives a signal.

Question 10

"Only include what's required:" --> regarding standard headers: don't sweat extra headers. Some IDE's do have an automatic header includer to get just the right ones - so great, use that if available. But insuring just the right minimum set of standard header manually is not worth a coder's valuable time. In fact, including extra headers does have at least this benefit: Making sure this .c file's namespace does not collide with others.

J_H 42.3k3 gold badges38 silver badges157 bronze badges · Accepted Answer · 2024-05-29 02:49:48Z

references

Woo hoo! You cite a reference for the underlying algorithm. Excellent.

In the header comments for match(), nit: Minor grammar / typos.

includes

#include <stdlib.h>
#include <sys/queue.h>
#include <stdbool.h>
#include "aho_corasick.h"

IDK, maybe elide the first two? Given that your header will pull them in? Or maybe there's some common wisdom about this sort of thing, like keeping the first two makes certain static analysis easier, so I should just stay mum.

EDIT: @Harith offers this sage advice about being robust to unrelated file edits:

the source files should not depend on their corresponding header files to include their dependencies, neither should the header file include what it doesn't require.

Also, thank you for being explicit about the "no unicode!" assumption, whew!, what a relief.

OIC, the ASCII_CHAR_COUNT definition makes it very easy to e.g. impose a 7-bit US-ASCII restriction on inputs, immediately cutting memory use in approximately half. Nice.

negative char

struct ac_node
{
 char val;

Hmmm, signed char, this will be exciting. I'll go get some popcorn.

Ok, I'm back, read it to the end. Looks like uchar would work here. Certainly that's what we intend when indexing those arrays. Hmmm, actually we promote to signed int before indexing. Interesting. I think you intend to operate over e.g. 8-bit clean Latin1 text, with no negatives.

Thank you for those nice size_t declarations we see all over the place, they look great.

 struct ac_node *suffix_link, *dict_suffix_link;

And thank you for those names. Following the terminology of the cited reference makes it much easier to trace the correspondence between it and the implementation.

success check

ac_node_t *new_node(ac_node_t *parent, char val)
{
 ac_node_t *node = calloc(1, sizeof(ac_node_t));
 node->parent = parent;

Sigh, UB! You can save it with a safe_calloc macro definition which tests for NULL and exits the process upon failure.

Similarly for the enqueue and add_match calls to malloc().

braces

There's a giant literature on how

 if (c)
 single_statement();

leads to maintenance trouble down the road. Consider adopting a linter / pretty-printer / style guide which arranges for { } braces even in the single-statement case.

Even if it's fine in your code, you may find it's a good habit for when you collaborate with others.

(And now I see why macros such as LIST_INSERT_HEAD need to have do ... while(0) on the inside.)

test suite

The unit test is very nice, and self-evaluating so it would report failure. But there's only one. I was kind of hoping to see some giant examples and maybe some benchmark timings, or some counter instrumentation. I mean, that was a lot of work, so now it's time to show off, right?

Here's one to consider. Grab a copy of /usr/share/dict/words as the haystack and cycle through a couple of word sets to grep from it. Choose words long enough to have only one or a handful of matches. Maybe one set has 2 words and the other has 20. Search for matches, reporting elapsed time and perhaps counter values. Since the size of the words file dominates, ideally we see very similar elapsed times across the runs.

memory footprint

Each ac_node is on the large side, containing 256 64-bit pointers. We have sparsity; the vast majority of those pointers will be NULL. Certainly indexing the children array is very fast.

Consider linking against a library that supplies a hash table, and turning index operations into hashmap operations. This may make the program more cache friendly, and hence faster. Benchmark and see!

This codebase achieves its design goals.

I would be willing to delegate or accept maintenance tasks on it.

About the hash table: one could use the POSIX hcreate()/hsearch()/hdestroy() functions.

Stack Exchange Network

Aho-Corasick algorithm implemented in C

aho_corasick.h/c

test.c

Makefile

Project Layout

2 Answers 2

references

includes

negative char

success check

braces

test suite

memory footprint

Only include what's required:

Use `static` keyword for array parameters:

Potential Undefined Behavior:

Portability:

You must log in to answer this question.

Hot Network Questions

Aho-Corasick algorithm implemented in C

aho_corasick.h/c

test.c

Makefile

Project Layout

2 Answers 2

references

includes

negative char

success check

braces

test suite

memory footprint

Only include what's required:

Use static keyword for array parameters:

Potential Undefined Behavior:

Portability:

You must log in to answer this question.

Related

Hot Network Questions

Use `static` keyword for array parameters: