This is an implementation of a trie, that I will be using on a code editor to parse keywords from a given programming language. I decided to use a trie when I asked this question, and the implementation seems to perform pretty well and without errors from valgrind/memcheck.
I want to make sure I'm squeezing every bit of performance out of this thing, though. So I want to know if there's anything more I can do to improve performance (I guess that makes this a performance-related review). Are there any performance tips?
While I'm not specifically looking for a review of the coding style, if I'm doing anything weird please point it out.
Implementation:
Header:
// trie.h
#include <stdlib.h>
#define TRIE_SIZE 256 // would be different for unicode
struct trie_node {
int class; /* acts as a class or id of the word
* and used for checking for a match
*/
struct trie_node * nodes[TRIE_SIZE];
};
struct trie {
struct trie_node head, * current;
};
int trie_init (struct trie * t);
int trie_adds (struct trie_node * head, char * str, int class);
int trie_freenode (struct trie_node * node);
int trie_free (struct trie * t);
int trie_pass (struct trie * t, unsigned char c);
int trie_reset (struct trie * t);
Implementation:
#include "trie.h"
#include <stdlib.h>
int trie_init (struct trie * t){
t->current = &t->head;
for (int i = 0; i < TRIE_SIZE; i++){
t->head.nodes[i] = 0;
}
t->head.class = 0;
return 0;
}
int trie_adds (struct trie_node * head, char * str, int class){
if (str[0]){
int num = str[0];
if (!head->nodes[num]){ // create new node
head->nodes[num] = calloc (1, sizeof (struct trie_node));
if (!head->nodes[num]) return -1;
}
if (str[1]) return trie_adds (head->nodes[num], &str[1], class);
else head->nodes[num]->class = class;
}
return 0;
}
// used in trie_free
static int trie_freenode (struct trie_node * node){
if (!node || !node->nodes){
return -1;
}
for (int i = 0; i < TRIE_SIZE; i++){
if (node->nodes[i]){
trie_freenode (node->nodes[i]);
free (node->nodes[i]);
}
}
return 0;
}
int trie_free (struct trie * t){
for (int i = 0; i < TRIE_SIZE; i++){
if (t->head.nodes[i]){
trie_freenode (t->head.nodes[i]);
free (t->head.nodes[i]);
t->head.nodes[i] = 0;
}
}
t->current = &t->head;
return 0;
}
int trie_pass (struct trie * t, unsigned char c){
t->current = t->current ? t->current->nodes[c]) : 0;
return t->current ? t->current->state : 0;
}
int trie_reset (struct trie * t){
t->current = &t->head;
return 0;
}
Comparative test program (no trie):
#include <stdio.h>
#include <stdlib.h>
char * gettestfile (char * filename);
int strcmp2 (char * str1, char * str2);
#define isqualifier(c)\
(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')))
int main (int argc, char * argv[]){
if (argc < 3){
printf ("usage %s filename [words]\n", argv[0]);
return -1;
}
char * buffer = gettestfile (argv[1]);
if (!buffer){
return -1;
}
int nmatches = 0;
for (int i = 0; buffer[i];){
while (buffer[i] && !isqualifier (buffer[i])) i++;
if (!buffer[i]) break;
int i2 = 0;
for (int j = 2; j < argc; j++){
i2 = strcmp2 (&buffer[i], argv[j]);
if (i2 > 0) break;
}
if (i2 > 0){
i += i2;
nmatches++;
} else while (buffer[i] && isqualifier (buffer[i])) i++;
}
printf ("found %d matches\n", nmatches);
free (buffer);
return 0;
}
int strcmp2 (char * str1, char * str2){
int i = 0;
for (; str1[i] && str2[i]; i++){
if (str1[i] != str2[i]){
break;
}
}
return (str2[i] || isqualifier (str1[i])) ? 0 : i;
}
char * gettestfile (char * filename){
char * buffer = 0;
int len = 0;
FILE * file = fopen (filename, "r");
if (!file){
return 0;
}
fseek(file, 0L, SEEK_END);
len = ftell(file);
fseek(file, 0L, SEEK_SET);
buffer = calloc (len + 1, 1);
if (!buffer){
fclose (file);
return 0;
}
fread (buffer, 1, len, file);
fclose (file);
return buffer;
}
Comparative test program (with trie):
#include <stdio.h>
#include <stdlib.h>
#include "trie.h"
#define isqualifier(c)\
(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')))
char * gettestfile (char *);
int main (int argc, char * argv[]){
char * buffer = 0;
struct trie t;
if (argc < 3){
printf ("usage: %s filename [words]\n", argv[0]);
return -1;
}
trie_init (&t);
for (int i = 2; i < argc; i++){
trie_adds (&t.head, argv[i], 1);
}
buffer = gettestfile (argv[1]);
int nmatches = 0;
if (buffer){
for (int i = 0; buffer[i]; i++){
char c = buffer[i];
if (isqualifier (c)){
trie_pass (&t, buffer[i]);
} else {
if (t.current && t.current->class){
nmatches++;
}
trie_reset (&t);
}
}
free (buffer);
}
printf ("found %d matches\n", nmatches);
trie_free (&t);
return 0;
}
char * gettestfile (char * filename){
char * buffer = 0;
int len = 0;
FILE * file = fopen (filename, "r");
if (!file){
return 0;
}
fseek(file, 0L, SEEK_END);
len = ftell(file);
fseek(file, 0L, SEEK_SET);
buffer = calloc (len + 1, 1);
if (!buffer){
fclose (file);
return 0;
}
fread (buffer, 1, len, file);
fclose (file);
return buffer;
}
Notes about implementation:
isqualifier
is used to check if a character is a qualifying unit of a word. In this case, it is the alphabet (it may also be contain numbers and '_' if it is parsing code). The use of this macro is to eliminate matches like the
within the word there
. If there is a qualifier before or after a two sequences of characters, it is not considered a match.
Performance testing:
I've already timed both of these with a large text file. Here's some of the results:
Test 1:
root@nodeTwo:/usr/local/src/trie# time ./std-test big.txt one
found 3091 matches
real 0m0.058s
user 0m0.052s
sys 0m0.004s
root@nodeTwo:/usr/local/src/trie# time ./trie-test big.txt one
found 3091 matches
real 0m0.068s
user 0m0.064s
sys 0m0.000s
Test 2:
root@nodeTwo:/usr/local/src/trie# time ./std-test big.txt one the here that now next hello for while this
found 100085 matches
real 0m0.137s
user 0m0.132s
sys 0m0.004s
root@nodeTwo:/usr/local/src/trie# time ./trie-test big.txt one the here that now next hello for while this
found 100085 matches
real 0m0.077s
user 0m0.072s
sys 0m0.004s
Test 3:
root@nodeTwo:/usr/local/src/trie# time ./std-test big.txt one the here that now next hello for while this when do something crazy let me know
found 107419 matches
real 0m0.188s
user 0m0.184s
sys 0m0.004s
root@nodeTwo:/usr/local/src/trie# time ./trie-test big.txt one the here that now next hello for while this when do something crazy let me know
found 107419 matches
real 0m0.077s
user 0m0.072s
sys 0m0.004s
Notes about testing:
To perform this test, I downloaded this text file (warning: it's 6.5MB of text) and used it with both programs. If you would like to download the text file and build the source for this review, you can use that link and this link to the GitHub page.
Solutions:
The only way I can think of that may increase performance is by allocating the entire list into continuous virtual memory (would that help get it into CPU Cache?), instead of broken up chunks, and mapping the list to that. I don't know how much to expect this to help, so I'm not sure if I should bother with it or not.
2 Answers 2
It's encouraging that your test indicates that the trie execution time is (almost) independent of the number of keywords.
I think you'll find that isqualifier
is a significant contributor to the execution time, and gets worse as the complexity of the list of qualifying characters grows. For anything more complicated than [A-Za-z], a look-up table will be just as good. Only 256 bytes are needed; if that bothers you, you can actually make it work (for certain definitions of qualifying sets) with a couple of 64-bit constants and some bit-whacking:
// Matches [0-9A-Za-z_]
static const uint64_t low_64 = 0x03FF000000000000UL;
static const uint64_t high_64 = 0x07FFFFFE87FFFFFEUL;
static inline bool qualifies(uint8_t c) {
uint8_t bit7 = (c >> 6);
return ((c >> 7) ^ 1) & (
((low_64 >> (c & 0x7F)) & (bit7 ^ 1) |
((high_64 >> (c & 0x7F)) & bit7));
}
There are lots of variations on the above theme; you'd need to profile. But I wouldn't actually do that with tries, because you can take advantage of the trie itself to discriminate.
The trie is really a simple state machine. With a small modification, we can actually build the precise state machine. Here it is in a simple version, recognizing a
, on
and one
. Note that all the keywords end with a transition to "Accept"; this is really a transition to "Start" which also indicates that the keyword was accepted. (Code below.)
Start: a --> State1
o --> State2
other qualifying --> Skip
non_qualifying --> Start
# Come here after matching `a`
State1: qualifying --> Skip
non_qualifying --> Accept
# Come here after matching `o`
State2: n --> State3
other qualifying --> Skip
non_qualifying --> Start
# Come here after matching `on`
State3: e --> State4
other qualifying --> Skip
non_qualifying --> Accept
# Come here after matching `one`
State4: qualifying --> Skip
non_qualifying --> Accept
# Come here after matching any other word.
Skip: qualifying --> Skip
non_qualifying --> Start
Building this state machine is almost exactly the same as building the trie. The following is based on your trie code:
struct trie_node {
int class;
struct trie_node* nodes[TRIE_SIZE];
};
struct trie {
trie_node skip;
trie_node start;
trie_node* current;
}
int trie_init(struct trie* t) {
for (int c = 0; c < TRIE_SIZE; ++c) {
t->skip[c] = isqualifying(c) ? &t->skip
: &t->start;
}
// Initially, no keywords
t->start = t->skip;
t->current = &t->start;
}
int trie_adds(struct trie* t, const char* s, int class) {
struct trie_node* current = t->start;
for (; *s; ++s) {
uint8_t ch = *s;
if (current[ch] == &t->skip) {
current[ch] = malloc(sizeof *current[ch]);
*current[ch] = t->skip;
}
current = current[ch];
}
current->class = class;
}
static inline int trie_step(struct trie* t, int ch) {
int class = t->current->class;
t->current = t->current.nodes[ch];
return t->current == &t->start ? class : 0;
}
(The above code is completely untested. It probably doesn't even compile.)
With the above, you no longer need to call is_qualifying on every character. You just pass each character in turn -- including the terminating NUL -- through trie_step, and any time it returns a non-zero value, you've found a keyword.
On the assumption that the list of keywords does not change often, the fastest code will be to precompile your trie into a tree of switch statements like this, where p
is the current character pointer, and suppose your keywords are a
, aa
, ab
, b
, ba
and bb
:
switch(p[0]){
break; case 'a': switch(p[1]){
break; case 'a':{
if (!alphanumeric(p[2]){/* found aa */}
}
break; case 'b':{
if (!alphanumeric(p[2]){/* found ab */}
}
break; default: if (!alphanumeric(p[1])){ /* found a */ } else
/* no keyword here */
}
break; case 'b': switch(p[1]){
break; case 'a':{
if (!alphanumeric(p[2]){/* found ba */}
}
break; case 'b':{
if (!alphanumeric(p[2]){/* found bb */}
}
break; default: if (!alphanumeric(p[1])){ /* found b */ } else
/* no keyword here */
}
break; default:
/* no keyword here */
}
This code is tedious to write (!), but you could write a separate program to input the list of keywords and print out this code, which you could then include in your original program.
Why is it fast? No data structure.
When I need speed, I try to use as little data structure as possible, and only to hold information that actually is not known until run time.
-
\$\begingroup\$ I avoided this because I will be using this tree for many coding languages, and writing this kind of code for all of them would take a long time and would be hard to maintain and debug. You're right that it would be faster, but I don't think I can practically use this solution \$\endgroup\$tay10r– tay10r2013年08月31日 03:45:44 +00:00Commented Aug 31, 2013 at 3:45
-
\$\begingroup\$ @Taylor: That's up to you of course, but if the little preprocessing program is written, then it only has to be run once for each language, and a separate executable (or dll) compiled for each language. If you want to have a single executable for all languages, then I would do it the way you are, except I would not use a trie, for space reasons. I would just use binary search as a first cut. I would not assume this is a bottleneck until it actually proves to be one, which I detect by random pausing. \$\endgroup\$Mike Dunlavey– Mike Dunlavey2013年08月31日 14:22:57 +00:00Commented Aug 31, 2013 at 14:22
Explore related questions
See similar questions with these tags.