I recently read K&R The C Programming Language and wanted to write a small program to count the occurrence of each word in the input (std-input - I piped a file with all shakespeare pieces in there).
Since I wanted to learn how to split source files into multiple files with this project I pretty much outsources every function I could think of. This is the part I'm most unsure, whether its bad practice or not (I include multiple libraries from the std-lib multiple times. Is this a bad thing?). You'll see.
find_words.c
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include "word_finder.h"
#include "binary_tree.h"
#define MAX_WORD 20
int main(int argc, char *argv[])
{
char word[MAX_WORD];
struct tnode *root = NULL;
int min_count = 1500;
int max_count = 20000;
if (argc > 3) {
printf("Usage: ./find_words [min_count] [max_count] <INPUT_PIPE\n");
return 0;
}
if(argv[1] != NULL)
min_count = atoi(argv[1]);
if(argv[2] != NULL)
max_count = atoi(argv[2]);
while(get_word(word, MAX_WORD) != 0)
if(isalpha(word[0]))
root = addtree(root, word);
treeprint(root, min_count, max_count);
return 1;
}
word_finder.h
int get_word(char *word, int limit);
word_finder.c
#include <ctype.h>
#include <stdio.h>
#include "io.h"
int get_word(char *word, int size)
{
int c;
while((isspace(c = getch())))
;
if (c == EOF)
return 0;
int i = 0;
do
{
*(word+i++) = c;
} while (isalpha(c = getch()) && i < size-1);
*(word+i) = '0円';
ungetch(c);
return i;
}
io.h
int getch(void);
void ungetch(int);
io.c
#include <stdio.h>
#include "io.h"
#define BUFSIZE 100
/* Buffer can hold up to BUFSIZE - 1 characters */
char buf[BUFSIZE];
int bufhead = 0;
int buftail = 0;
int getch(void)
{
if (bufhead == buftail)
return getchar();
else {
int temp = buf[buftail];
buftail = (buftail + 1) % BUFSIZE;
return temp;
}
}
void ungetch(int c)
{
if ((bufhead + 1) % BUFSIZE == buftail % BUFSIZE) {
fprintf(stderr, "Buffer full, dropped %c.\n", c);
} else {
buf[bufhead] = c;
bufhead = (bufhead + 1) % BUFSIZE;
}
}
binary_tree.h
struct tnode {
char *word;
int count;
struct tnode *right;
struct tnode *left;
};
struct tnode *addtree(struct tnode *, char *);
void treeprint(struct tnode *, int, int);
binary_tree.c
#include "binary_tree.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
static char *strduplicate(const char *);
static struct tnode *talloc(void);
struct tnode *addtree(struct tnode *node, char *word)
{
int cond;
if(node == NULL) { //The word doesnt exist in the tree
node = talloc();
node->word = strduplicate(word);
node->count = 1;
node->left = node->right = NULL;
} else if((cond = strcmp(word, node->word)) == 0) {
node->count++;
} else if(cond < 0) {
node->left = addtree(node->left, word);
} else {
node->right = addtree(node->right, word);
}
return node;
}
static struct tnode *talloc(void)
{
return (struct tnode *) malloc(sizeof(struct tnode));
}
static char *strduplicate(const char *s)
{
char *p;
p = (char *) malloc(strlen(s)+1);
if(p != NULL)
strcpy(p,s);
return p;
}
void treeprint(struct tnode *node, int min_count, int max_count)
{
if(node != NULL) {
treeprint(node->left, min_count, max_count);
if(node->count >= min_count && node->count <= max_count)
printf("%4d %s\n", node->count, node->word);
treeprint(node->right, min_count, max_count);
}
}
I'm sorry for the long post (I'm not 100% sure if this is allowed - I'll delete the post immediately if it's not).
1 Answer 1
"whether its bad practice or not" to sub-divide. No, not bad practice - in fact good in certain areas. It often comes down to maintenance - how many functions per file and the optimal answer is highly problem dependent. IMO, too often files need fewer functions than code needs fewer files.
ungetch()
have troubles as it ungets aEOF
. This special value should not be unget-able. Further – I have doubts that whenchar
issigned
, that these 2 functions properly handle some char values as the realgetc()
returnsunsigned char
andEOF
. Example: If(char) 255
is un-gotten, on get, it may have the value ofEOF
. Recommend making your unget bufferunsigned char
.Unclear to the whole need for user
unget/get
functionality as standard library ones well handle 1 level ofget/unget
.The whole
addtree()
, should input be alphabetical, devolves into a linked list. Consider AVL trees.Below test
i
againstsize
after the assignment. Recommend test before to cope with pathologicalsize
values like 1. Recommend re-write - avoiding code like(word+i++)
. Further, considersize_t
rather thanint
for array indexes.// from do { *(word+i++) = c; } while (isalpha(c = getch()) && i < size-1); // to if (size > 0) size--; while (i < size) { word[i++] = c; c = getch() if (!isalpha(c)) { break; } }
MAX_WORD
is fuzzy. SuggestMAX_WORD_SIZE
.ungetch()
andgetch()
look way too much like standard functions. Suggestio_ungetch()
andio_getch()
.Cast not needed.
// return (struct tnode *) malloc(sizeof(struct tnode)); return malloc(sizeof(struct tnode)); // p = (char *) malloc(strlen(s)+1); p = malloc(strlen(s)+1);
Magic number 4. Why 4?
// printf("%4d %s\n", node->count, node->word); printf("%d %s\n", node->count, node->word);
If a function does not modify a pointer's contents, use
const
.// void treeprint(struct tnode *node, int min_count, int max_count) void treeprint(const struct tnode *node, int min_count, int max_count)
Minor: .h file
void ungetch(int);
-->void ungetch(int ch);
Also add a little documentation of the .h files, it goes a long way in understanding.Minor: spelling:
doesnt
-->doesn't
Pedantic point:
char
may be signed andis...()
defined forunsigned char
andEOF
, notsigned char
.// if(isalpha(word[0])) if(isalpha((unsigned char) word[0]))
-
\$\begingroup\$ Good and comprehensive one. A few points: #8: A good link: stackoverflow.com/questions/605845/… #9: That's a format-string, and the exact choice of format is quite arbitrary. #11: Yep, that's one of the places including the name is good documentation: codereview.stackexchange.com/questions/90111/…. \$\endgroup\$Deduplicator– Deduplicator2015年11月13日 20:05:24 +00:00Commented Nov 13, 2015 at 20:05