I have a following exercise from ANSI C book:
Exercise 6.1. Our version of getword does not properly handle underscores, string constants, comments, or preprocessor control lines. Write a better version.
Here's my improved version of getword function and code that test if it works:
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#define BUFSIZE 100
#define MAXWORD 100
#define NKEYS (sizeof keytab / sizeof(keytab[0]))
int buf[BUFSIZE];
int bufp = 0;
struct key {
char * word;
int count;
} keytab[] = {
{ "#define", 0 },
{ "#elif", 0 },
{ "#else", 0 },
{ "#endif", 0 },
{ "#error", 0 },
{ "#if", 0 },
{ "#ifdef", 0 },
{ "#ifndef", 0 },
{ "#include", 0 },
{ "#line", 0 },
{ "#pragma", 0 },
{ "auto", 0 },
{ "break", 0 },
{ "case", 0 },
{ "char", 0 },
{ "const", 0 },
{ "continue", 0 },
{ "default", 0 },
{ "do", 0 },
{ "double", 0 },
{ "else", 0 },
{ "enum", 0 },
{ "extern", 0 },
{ "float", 0 },
{ "for", 0 },
{ "goto", 0 },
{ "if", 0 },
{ "int", 0 },
{ "long", 0 },
{ "register", 0 },
{ "return", 0 },
{ "short", 0 },
{ "signed", 0 },
{ "sizeof", 0 },
{ "static", 0 },
{ "struct", 0 },
{ "switch", 0 },
{ "typedef", 0 },
{ "union", 0 },
{ "unsigned", 0 },
{ "void", 0 },
{ "volatile", 0 },
{ "while", 0 },
};
int getword(char *, int);
int binsearch(char *, struct key *, int);
int getch(void);
void ungetch(int);
int main(void) {
int n, c;
char word[MAXWORD];
while ((c = getword(word, MAXWORD)) != EOF) {
// printf("getword(word, MAXWORD) = %c %d\n", c, c);
// printf("word = %s\n", word);
if (isalpha(word[0]) || word[0] == '_' || word[0] == '#') {
if ((n = binsearch(word, keytab, NKEYS)) >= 0) {
keytab[n].count++;
}
}
}
for (n = 0; n < NKEYS; n++)
if (keytab[n].count > 0)
printf("%4d %s\n",
keytab[n].count, keytab[n].word);
return 0;
}
int getch(void) {
return (bufp > 0) ? buf[--bufp] : getchar();
}
void ungetch(int c) {
if (bufp >= BUFSIZE)
printf("ungetch: too many characters\n");
else
buf[bufp++] = c;
}
int getword(char * word, int lim) {
int c;
char * w = word;
static int last;
while (isblank(c = getch()))
;
if (c != EOF)
*w++ = c;
if (last == '/' && c == '/') {
while (c = getch() != '\n')
;
return c;
}
if (last == '/' && c == '*') {
x:
while (c = getch() != '*')
;
if (c = getch() == '/')
return c;
else
goto x;
}
if (c == '\'') {
while (c = getch() != '\'')
;
return c;
}
if (c == '\"') {
while (c = getch() != '\"')
;
return c;
}
if (!isalpha(c) && c != '_' && c != '#') {
*w = '0円';
last = c;
return c;
}
for ( ; --lim > 0; w++) {
if (!isalnum(*w = getch()) && *w != '_') {
ungetch(*w);
break;
}
}
*w = '0円';
last = word[0];
return word[0];
}
int binsearch(char * word, struct key tab[], int n) {
int cond;
int low, high, mid;
low = 0;
high = n - 1;
while (low <= high) {
mid = (low + high) / 2;
if ((cond = strcmp(word, tab[mid].word)) < 0)
high = mid - 1;
else if (cond > 0)
low = mid + 1;
else
return mid;
}
return -1;
}
What are your opinions about my solution? Is it a correct approach? Is this code enough fast?
2 Answers 2
The
last
logic is quite hard to trace, and it is prone to bugs. Consider/* ... */*
. Once the code reached the closing/
, it returns, butlast
is not updated, and remains/
, so the next*
is still treated as a beginning of a comment.Handling of string constants has a definite problem: it doesn't account for an escaped quote char in the middle of the string (e.g.
"aaa\"bbb"
).
Combining the two above observations, I strongly recommend to revisit the design, and realize few more functions, such as get_line_comment
, get_c_comment
, get_quoted_string
etc.
No word in
keytab
starts with an underscore. You don't need to go intobinsearch
in that case.The preprocessor directive may not appear in the middle of the line. I am not sure if a directive in such case should be counted.
Please don't do this:
struct key {
char * word;
int count;
} keytab[] = {
{ "#define", 0 },
{ "#elif", 0 },
...
Separate out the definition of the struct
from the declaration of the variable. This is very hard to read, unexpected, and smacks of cleverness for cleverness' sake. It also saves you all of maybe 10-15 characters at the cost of readability.