1
\$\begingroup\$

I have a following exercise from ANSI C book:

Exercise 6.1. Our version of getword does not properly handle underscores, string constants, comments, or preprocessor control lines. Write a better version.

Here's my improved version of getword function and code that test if it works:

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#define BUFSIZE 100
#define MAXWORD 100
#define NKEYS (sizeof keytab / sizeof(keytab[0]))
int buf[BUFSIZE];
int bufp = 0;
struct key {
 char * word;
 int count;
} keytab[] = {
 { "#define", 0 },
 { "#elif", 0 },
 { "#else", 0 },
 { "#endif", 0 },
 { "#error", 0 },
 { "#if", 0 },
 { "#ifdef", 0 },
 { "#ifndef", 0 },
 { "#include", 0 },
 { "#line", 0 },
 { "#pragma", 0 },
 { "auto", 0 },
 { "break", 0 },
 { "case", 0 },
 { "char", 0 },
 { "const", 0 },
 { "continue", 0 },
 { "default", 0 },
 { "do", 0 },
 { "double", 0 },
 { "else", 0 },
 { "enum", 0 },
 { "extern", 0 },
 { "float", 0 },
 { "for", 0 },
 { "goto", 0 },
 { "if", 0 },
 { "int", 0 },
 { "long", 0 },
 { "register", 0 },
 { "return", 0 },
 { "short", 0 },
 { "signed", 0 },
 { "sizeof", 0 },
 { "static", 0 },
 { "struct", 0 },
 { "switch", 0 },
 { "typedef", 0 },
 { "union", 0 },
 { "unsigned", 0 },
 { "void", 0 },
 { "volatile", 0 },
 { "while", 0 },
};
int getword(char *, int);
int binsearch(char *, struct key *, int);
int getch(void);
void ungetch(int);
int main(void) {
 int n, c;
 char word[MAXWORD];
 while ((c = getword(word, MAXWORD)) != EOF) {
// printf("getword(word, MAXWORD) = %c %d\n", c, c);
// printf("word = %s\n", word);
 if (isalpha(word[0]) || word[0] == '_' || word[0] == '#') {
 if ((n = binsearch(word, keytab, NKEYS)) >= 0) {
 keytab[n].count++;
 }
 }
 }
 for (n = 0; n < NKEYS; n++)
 if (keytab[n].count > 0)
 printf("%4d %s\n",
 keytab[n].count, keytab[n].word);
 return 0;
}
int getch(void) {
 return (bufp > 0) ? buf[--bufp] : getchar();
}
void ungetch(int c) {
 if (bufp >= BUFSIZE)
 printf("ungetch: too many characters\n");
 else
 buf[bufp++] = c;
}
int getword(char * word, int lim) {
 int c;
 char * w = word;
 static int last;
 while (isblank(c = getch()))
 ;
 if (c != EOF)
 *w++ = c;
 if (last == '/' && c == '/') {
 while (c = getch() != '\n')
 ;
 return c;
 }
 if (last == '/' && c == '*') {
 x:
 while (c = getch() != '*')
 ;
 if (c = getch() == '/')
 return c;
 else
 goto x;
 }
 if (c == '\'') {
 while (c = getch() != '\'')
 ;
 return c;
 }
 if (c == '\"') {
 while (c = getch() != '\"')
 ;
 return c;
 }
 if (!isalpha(c) && c != '_' && c != '#') {
 *w = '0円';
 last = c;
 return c;
 }
 for ( ; --lim > 0; w++) {
 if (!isalnum(*w = getch()) && *w != '_') {
 ungetch(*w);
 break;
 }
 }
 *w = '0円';
 last = word[0];
 return word[0];
}
int binsearch(char * word, struct key tab[], int n) {
 int cond;
 int low, high, mid;
 low = 0;
 high = n - 1;
 while (low <= high) {
 mid = (low + high) / 2;
 if ((cond = strcmp(word, tab[mid].word)) < 0)
 high = mid - 1;
 else if (cond > 0)
 low = mid + 1;
 else
 return mid;
 }
 return -1;
}

What are your opinions about my solution? Is it a correct approach? Is this code enough fast?

asked Jul 21, 2018 at 16:01
\$\endgroup\$

2 Answers 2

1
\$\begingroup\$
  • The last logic is quite hard to trace, and it is prone to bugs. Consider /* ... */*. Once the code reached the closing /, it returns, but last is not updated, and remains /, so the next * is still treated as a beginning of a comment.

  • Handling of string constants has a definite problem: it doesn't account for an escaped quote char in the middle of the string (e.g. "aaa\"bbb").

Combining the two above observations, I strongly recommend to revisit the design, and realize few more functions, such as get_line_comment, get_c_comment, get_quoted_string etc.

  • No word in keytab starts with an underscore. You don't need to go into binsearch in that case.

  • The preprocessor directive may not appear in the middle of the line. I am not sure if a directive in such case should be counted.

answered Jul 21, 2018 at 18:08
\$\endgroup\$
1
\$\begingroup\$

Please don't do this:

struct key {
 char * word;
 int count;
} keytab[] = {
 { "#define", 0 },
 { "#elif", 0 },
...

Separate out the definition of the struct from the declaration of the variable. This is very hard to read, unexpected, and smacks of cleverness for cleverness' sake. It also saves you all of maybe 10-15 characters at the cost of readability.

answered Jul 22, 2018 at 1:34
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.