I've been reimplementing the fgrep utility in C for fun & study, and so had to implement the Aho-Corasick algorithm.
I'm trying to improve my C skills in general, so any and all feedback would be greatly appreciated.
The project is compiled using gcc and gnu17.
Thanks in advance :]
aho_corasick.h/c
/*
* FILE: aho_corasick.h
* ABOUT: Definitions for the aho-corasick string matching algorithm.
* See https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
*/
#ifndef AHO_CORASIC_H
#define AHO_CORASIC_H
#include <stdlib.h>
#include <sys/queue.h>
struct ac_node;
typedef struct ac_node ac_node_t;
typedef struct ac_automaton {
ac_node_t *root;
} ac_automaton_t;
/**
* Creates a new automaton used in string matching.
* @param words Dictionary of words to be matched against.
* @param wc Number of words in words.
* @return An automaton build from the dictionary.
*/
ac_automaton_t new_automaton(const char **words, size_t wc);
/**
* Frees the memory allocated by the automaton.
* @param automaton Automaton to be deleted.
*/
void delete_automaton(ac_automaton_t automaton);
typedef struct match
{
size_t start, length;
LIST_ENTRY(match) entries;
} match_t;
typedef LIST_HEAD(match_list_head, match) match_list_head_t;
/**
* Frees the memory allocated by the match-list.
* @param head Head of the match-list to be deleted.
*/
void delete_match_list(match_list_head_t *head);
/**
* Finds all substring matching an of the automatons dictionary.
* @param str String to be searched.
* @param automaton Automaton used in matching.
* @return A list of matches in order of appearence in str.
*/
match_list_head_t *match(const char *str, const ac_automaton_t automaton);
#endif
/*
* FILE: aho_corasick.c
* ABOUT: Implementation of the aho-corasick string matching algorithm defined in aho_corasick.h.
* See https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
*/
#include <stdlib.h>
#include <sys/queue.h>
#include <stdbool.h>
#include "aho_corasick.h"
#define ASCII_CHAR_COUNT 256 // No. of valid (extended) ASCII characters
struct ac_node
{
char val;
bool is_final;
size_t length;
struct ac_node *parent;
struct ac_node *children[ASCII_CHAR_COUNT];
struct ac_node *suffix_link, *dict_suffix_link;
};
ac_node_t *new_node(ac_node_t *parent, char val)
{
ac_node_t *node = calloc(1, sizeof(ac_node_t));
node->parent = parent;
node->val = val;
node->length = parent == NULL
? 0
: parent->length + 1;
return node;
}
typedef struct node_q_entry
{
ac_node_t *node;
STAILQ_ENTRY(node_q_entry) entries;
} node_q_entry_t;
typedef STAILQ_HEAD(node_q_head, node_q_entry) node_q_head_t;
static void enqueue(node_q_head_t *head, ac_node_t *node)
{
node_q_entry_t *entry = malloc(sizeof(node_q_entry_t));
entry->node = node;
STAILQ_INSERT_TAIL(head, entry, entries);
}
static ac_node_t *dequeue(node_q_head_t *head)
{
node_q_entry_t *entry = STAILQ_FIRST(head);
STAILQ_REMOVE_HEAD(head, entries);
ac_node_t *node = entry->node;
free(entry);
return node;
}
static void insert_word(ac_node_t *root, const char *word)
{
ac_node_t *node = root;
while (*word != '0円')
{
int c = *word;
if (node->children[c] == NULL)
node->children[c] = new_node(node, c);
node = node->children[c];
++word;
}
node->is_final = true;
}
static bool is_root(const ac_node_t *node)
{
return node->parent == NULL;
}
static ac_node_t *find_suffix_link(ac_node_t *node)
{
if (is_root(node)) return NULL;
if (is_root(node->parent)) return node->parent;
int val = node->val;
node = node->parent->suffix_link;
while (node->children[val] == NULL)
{
if (is_root(node)) return node;
node = node->parent;
}
return node->children[val];
}
static ac_node_t *find_dict_suffix_link(ac_node_t *node)
{
node = node->suffix_link;
while (node != NULL)
{
if (node->is_final) return node;
node = node->suffix_link;
}
return NULL;
}
ac_automaton_t new_automaton(const char **words, size_t wc)
{
ac_node_t *root = new_node(NULL, '0円');
// Insert words
for (size_t i = 0; i < wc; ++i)
{
const char *w = words[i];
insert_word(root, w);
}
// Add (dict) suffix links using breadth-first search
node_q_head_t head;
STAILQ_INIT(&head);
enqueue(&head, root);
while (!STAILQ_EMPTY(&head))
{
ac_node_t *node = dequeue(&head);
node->suffix_link = find_suffix_link(node);
node->dict_suffix_link = find_dict_suffix_link(node);
for (int i = 0; i < ASCII_CHAR_COUNT; ++i)
{
ac_node_t *child = node->children[i];
if (child != NULL) enqueue(&head, child);
}
}
return (ac_automaton_t) { .root = root };
}
void delete_automaton(ac_automaton_t automaton)
{
node_q_head_t head;
STAILQ_INIT(&head);
enqueue(&head, automaton.root);
while (!STAILQ_EMPTY(&head))
{
ac_node_t *node = dequeue(&head);
for (int i = 0; i < ASCII_CHAR_COUNT; ++i)
{
ac_node_t *child = node->children[i];
if (child != NULL) enqueue(&head, child);
}
free(node);
}
}
void delete_match_list(match_list_head_t *head)
{
match_t *m1 = LIST_FIRST(head);
while (m1 != NULL) {
match_t *m2 = LIST_NEXT(m1, entries);
free(m1);
m1 = m2;
}
LIST_INIT(head);
}
static void add_match(match_list_head_t *head, size_t pos, size_t length)
{
static match_t *last_match = NULL;
match_t *m = malloc(sizeof(match_t));
m->start = pos - length + 1;
m->length = length;
if (last_match == NULL)
LIST_INSERT_HEAD(head, m, entries);
else
LIST_INSERT_AFTER(last_match, m, entries);
last_match = m;
}
match_list_head_t *match(const char *str, const ac_automaton_t automaton)
{
match_list_head_t *head = malloc(sizeof(match_list_head_t));
LIST_INIT(head);
ac_node_t *node = automaton.root;
size_t pos = 0;
for (int c = *str; c != '0円'; c = *(++str))
{
// Follow children & suffix links
while (node->children[c] == NULL)
{
node = node->suffix_link;
if (node == NULL) break;
}
node = node == NULL
? automaton.root
: node->children[c];
// Look for matches
if (node->is_final)
add_match(head, pos, node->length);
ac_node_t *dict_node = node->dict_suffix_link;
while (dict_node != NULL)
{
add_match(head, pos, dict_node->length);
dict_node = dict_node->dict_suffix_link;
}
++pos;
}
return head;
}
test.c
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/queue.h>
#include "src/aho_corasick.h"
int main(void)
{
// wikipedia examples
const char* wiki_words[] = {
"a",
"ab",
"bab",
"bc",
"bca",
"c",
"caa",
};
size_t wiki_wc = sizeof(wiki_words)/sizeof(wiki_words[0]);
ac_automaton_t wiki_automaton = new_automaton(wiki_words, wiki_wc);
match_list_head_t *head = match("abccab", wiki_automaton);
match_t *m = LIST_FIRST(head);
assert(m->start == 0);
assert(m->length == 1);
m = LIST_NEXT(m, entries);
assert(m->start == 0);
assert(m->length == 2);
m = LIST_NEXT(m, entries);
assert(m->start == 1);
assert(m->length == 2);
m = LIST_NEXT(m, entries);
assert(m->start == 2);
assert(m->length == 1);
m = LIST_NEXT(m, entries);
assert(m->start == 3);
assert(m->length == 1);
m = LIST_NEXT(m, entries);
assert(m->start == 4);
assert(m->length == 1);
m = LIST_NEXT(m, entries);
assert(m->start == 4);
assert(m->length == 2);
m = LIST_NEXT(m, entries);
assert(m == NULL);
delete_match_list(head);
assert(LIST_EMPTY(head));
delete_automaton(wiki_automaton);
puts("Tests ran succesfully!");
return EXIT_SUCCESS;
}
Makefile
SRC_DIR := ./src
OBJ_DIR := ./obj
BIN_DIR := ./bin
CC := gcc
CFLAGS := -Wall -ggdb
$(OBJ_DIR)/aho_corasick.o: $(SRC_DIR)/aho_corasick.h $(SRC_DIR)/aho_corasick.c
@mkdir -p $(OBJ_DIR)
$(CC) -c -o $(OBJ_DIR)/aho_corasick.o $(SRC_DIR)/aho_corasick.c $(CFLAGS)
.PHONY: test clean
test: $(OBJ_DIR)/aho_corasick.o
@mkdir -p $(BIN_DIR)
$(CC) -o $(BIN_DIR)/test test.c $(OBJ_DIR)/* $(CFLAGS)
$(BIN_DIR)/test
clean:
rm -rf $(BIN_DIR) $(OBJ_DIR)
Project Layout
.
├── Makefile
├── src
│ ├── aho_corasick.c
│ └── aho_corasick.h
└── test.c
2 Answers 2
references
Woo hoo! You cite a reference for the underlying algorithm. Excellent.
In the header comments for match(), nit: Minor grammar / typos.
includes
#include <stdlib.h>
#include <sys/queue.h>
#include <stdbool.h>
#include "aho_corasick.h"
IDK, maybe elide the first two? Given that your header will pull them in? Or maybe there's some common wisdom about this sort of thing, like keeping the first two makes certain static analysis easier, so I should just stay mum.
EDIT: @Harith offers this sage advice about being robust to unrelated file edits:
the source files should not depend on their corresponding header files to include their dependencies, neither should the header file include what it doesn't require.
Also, thank you for being explicit about the "no unicode!" assumption, whew!, what a relief.
OIC, the ASCII_CHAR_COUNT definition makes it very easy to e.g. impose a 7-bit US-ASCII restriction on inputs, immediately cutting memory use in approximately half. Nice.
negative char
struct ac_node
{
char val;
Hmmm, signed char, this will be exciting. I'll go get some popcorn.
Ok, I'm back, read it to the end. Looks like uchar would work here.
Certainly that's what we intend when indexing those arrays.
Hmmm, actually we promote to signed int before indexing. Interesting.
I think you intend to operate over e.g. 8-bit clean Latin1 text,
with no negatives.
Thank you for those nice size_t declarations we see all over the place,
they look great.
struct ac_node *suffix_link, *dict_suffix_link;
And thank you for those names. Following the terminology of the cited reference makes it much easier to trace the correspondence between it and the implementation.
success check
ac_node_t *new_node(ac_node_t *parent, char val)
{
ac_node_t *node = calloc(1, sizeof(ac_node_t));
node->parent = parent;
Sigh, UB!
You can save it with a safe_calloc macro definition
which tests for NULL and exits the process upon failure.
Similarly for the enqueue and add_match calls to malloc().
braces
There's a giant literature on how
if (c)
single_statement();
leads to maintenance trouble down the road.
Consider adopting a linter / pretty-printer / style guide
which arranges for { } braces even in the single-statement case.
Even if it's fine in your code, you may find it's a good habit for when you collaborate with others.
(And now I see why macros such as LIST_INSERT_HEAD need to
have do ... while(0) on the inside.)
test suite
The unit test is very nice, and self-evaluating so it would report failure. But there's only one. I was kind of hoping to see some giant examples and maybe some benchmark timings, or some counter instrumentation. I mean, that was a lot of work, so now it's time to show off, right?
Here's one to consider.
Grab a copy of /usr/share/dict/words as the haystack and
cycle through a couple of word sets to grep from it.
Choose words long enough to have only one or a handful of matches.
Maybe one set has 2 words and the other has 20.
Search for matches, reporting elapsed time and perhaps counter values.
Since the size of the words file dominates,
ideally we see very similar elapsed times across the runs.
memory footprint
Each ac_node is on the large side,
containing 256 64-bit pointers.
We have sparsity; the vast majority of those pointers will be NULL.
Certainly indexing the children array is very fast.
Consider linking against a library that supplies a hash table, and turning index operations into hashmap operations. This may make the program more cache friendly, and hence faster. Benchmark and see!
This codebase achieves its design goals.
I would be willing to delegate or accept maintenance tasks on it.
-
3\$\begingroup\$ About the hash table: one could use the POSIX
hcreate()/hsearch()/hdestroy()functions. \$\endgroup\$G. Sliepen– G. Sliepen2024年05月29日 20:11:04 +00:00Commented May 29, 2024 at 20:11
To add to @J_H's good points:
Only include what's required:
In "aho_corasick.h", <stddef.h> should suffice for the definition of size_t. You do not need to include <stdlib.h> for it.
Use static keyword for array parameters:
In C99 and above, you can do array[static size] in function parameters to specify that array must have at least size items.
A compiler has then the right to assume that argument is not nullptr and therefore it could perform some optimizations.
Moreover, this also documents the function better. As the array must have at least size items, it also means that a nullptr is not a valid value for this argument.
It also eliminates the need of having to document that the function would invoke undefined behavior if nullptrs are passed in, or if the function was testing if any parameter is nullptr, it eliminates onw or more branches.
After modifying one function prototype, this is how it looks:
// ac_automaton_t new_automaton(const char **words, size_t wc);
ac_automaton_t new_automaton(size_t wc, const char *words[static wc]);
Now for these calls:
new_automaton(10, nullptr);
static const char *const words[5] = {};
new_automaton(10, words);
char *p = nullptr;
new_automaton(102, p);
GCC 14.1 detects all 3 wrong use cases and raises diagnostic information, whilst Clang 18.1.0 and x86-64 icx 202400 only detect the first one. A win for GCC.
MSVC does not support this C99 feature (who is suprised?), but MinGW gcc 13.1.0 and MinGW clang 16.0.2 have the same output as GCC and Clang.
Note that this feature is not usable for return values, local variables, or for functions like memcpy() because arrays of void are illegal. A GCC extension, __attribute__((nonnull)) allows parameters to be marked as nonnull. This extension is also supported by Intel's compiler and Clang. A Clang extension allows _Nullable, _Nonnull and _Null_unspecified. This extension is not supported by GCC. A GCC extension, __attribute__((returns_nonnull)) can be used to specify that a function returns a nonnull value. This extension is also supported by Clang and Intel's compiler. I await an _Optional type in the next revision of the C Standard. There are already some proposals for it.
Aside: In C23, you can replace the ASCII_CHAR_COUNT macro with:
static constexpr unsigned int ASCII_CHAR_COUNT = 256;
struct ac_node *children[ASCII_CHAR_COUNT];
Potential Undefined Behavior:
ac_node_t *node = calloc(1, sizeof(ac_node_t));
node->parent = parent;
There's no check for calloc() failure. malloc() and family returns a nullptr on failure. Failing to check for it risks invoking undefined behavior by a subsequent nullptr dereference.
Change it to:
ac_node_t *node = calloc(1, sizeof *node);
if (!node) { // Or, node == nullptr or node == NULL
// Complain
TDB_code();
}
And similarly for the other calls to malloc() in the other functions.
Note that I have changed sizeof (ac_node_t) to sizeof *node. In the case that node was changed to some other type, you would not require a similar change in the calloc() call. This eases maintainence.
Portability:
From the Linux man page:
HISTORY top
<sys/queue.h> macros first appeared in 4.4BSD.
NOTES top
Some BSDs provide SIMPLEQ instead of STAILQ. They are identical,
but for historical reasons they were named differently on
different BSDs. STAILQ originated on FreeBSD, and SIMPLEQ
originated on NetBSD. For compatibility reasons, some systems
provide both sets of macros. glibc provides both STAILQ and
SIMPLEQ, which are identical except for a missing SIMPLEQ
equivalent to STAILQ_CONCAT().
It should be enough to define
_BSD_SOURCEforsys/queue.h. Compiling with-std=gnu17should not be necessarySome compile-time detection is required to use the proper macros provided by an implementation. As the man page says, some BSDs provide
SIMPLEQinstead ofSTAILQ.
To add to @J_H's "braces" point, see: Apple's SSL/TLS bug.
// Insert words
for (size_t i = 0; i < wc; ++i)
{
const char *w = words[i];
insert_word(root, w);
}
And do not comment the obvious. Though I don't see why you need the extra variable w, insert_word(root, words[i]) should suffice. And if you go to Reddit, @skeeto might see your post and teach you a thing or two about fuzz testing with AFL++ (the author of this review is not familiar with the API yet).
I'd also merge these two assertions:
match_t *m = LIST_FIRST(head);
assert(m->start == 0);
assert(m->length == 1);
into one.
When you add "benchmark timings, or some counter instrumentation", like @J_H suggested, compile with -O2/-O3.
I'd also add .DELETE_ON_ERROR to the Makefile. According to GNU make's documentation:
If
.DELETE_ON_ERRORis mentioned as a target anywhere in the makefile, then make will delete the target of a rule if it has changed and its recipe exits with a nonzero exit status, just as it does when it receives a signal.
-
\$\begingroup\$ "Only include what's required:" --> regarding standard headers: don't sweat extra headers. Some IDE's do have an automatic header includer to get just the right ones - so great, use that if available. But insuring just the right minimum set of standard header manually is not worth a coder's valuable time. In fact, including extra headers does have at least this benefit: Making sure this .c file's namespace does not collide with others. \$\endgroup\$chux– chux2024年05月29日 13:46:36 +00:00Commented May 29, 2024 at 13:46
You must log in to answer this question.
Explore related questions
See similar questions with these tags.
calloc()/malloc()failure properly. Review later. Kindly specify whether you're using C99, C11, C17, or C23. \$\endgroup\$truewithout including<stdbool.h>. Are you perchance not compiling with warnings enabled, or compiling withstd=c2x/c23/gnu2x/gnu23, or ignoring the warnings? Can you include the contents of the Makefile? \$\endgroup\$sys/queue.halbeit, and doesn't know abouttrue,false, andboolunless you incude<stdbool.h>. The Linux man page states thatsys/queue.his part of the BSD Standard, so not even POSIX. \$\endgroup\$gcc 11.3.0which should default tognu17, sorry for the confusion 🤦).stdboolshould have been imported at the beginning ofaho_corasick.c. \$\endgroup\$