I'm working on my implementaion of the linux wc command. I finally have something that is working properly (with no options yet) but i think it needs a lot of "cleaning". I mean, i highly disrespect the "do not repeat yourself rule" and i want to hear some ideas on how can i polish this up a bit.
Here 's the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#define BUF_SIZE 1024
typedef struct Total{ //stores the Total count of lines words and bytes
int lines;
int words;
int bytes;
} Total;
Total* new_Total(int lines, int words, int bytes){ //constructor function for Total
Total* t = (Total*)malloc(sizeof(Total));
t->lines = lines;
t->words = words;
t->bytes = bytes;
return t;
}
void updateTotal(Total *t, int lines, int words, int bytes){ //updates Total counters
t->lines += lines;
t->words += words;
t->bytes += bytes;
}
void readStdin(Total* t, char c){ //reads from stdin, counts and prints
int lines = 0, words = 0, bytes = 0, startWord = 0;
char ch;
while((ch=fgetc(stdin)) != EOF){//count the bytes, lines and words
bytes++;
if (ch == '\n'){
lines++;
}
if (ch != ' ' && ch != '\n' && ch != '\r' && ch != '\t'){ //if there's a visible char, there's a word
startWord = 1;
}
if ((ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t') && startWord == 1){ //when the word ends
words++; //increment the word coutner
startWord = 0; //reset word finder
}
}
updateTotal(t, lines, words, bytes);
if (c == '0円'){ //if the command was called with no arguments
printf ("%d %d %d\n", lines, words, bytes);
}
else{
printf ("%d %d %d %c\n", lines, words, bytes, c);
}
}
void readFile(Total *t, char* filename, FILE* fp){
int lines = 0, words = 0, bytes = 0, startWord = 0;
char ch;
if (fp == NULL){
printf("%s: No such file or drectory\n", filename);
exit(1);
}
while ((ch=fgetc(fp)) != EOF){ //count the bytes, lines and words
bytes++;
if (ch == '\n'){
lines++;
}
if (ch != ' ' && ch != '\n' && ch != '\r' && ch != '\t'){
startWord = 1;
}
if ((ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t') && startWord == 1){
words++;
startWord = 0;
}
}
updateTotal(t, lines, words, bytes);
printf("%d %d %d %s\n", lines, words, bytes, filename);
}
void readArgs(Total* t, int argc, char* argv[]){
FILE* fp;
for (int i=1; i<argc; i++){
if (*argv[i] == '-'){//if a '-' is found, read from stdin
readStdin(t, '-');
clearerr(stdin);
}
else { //tries to read the file in argv[i]
fp = fopen(argv[i], "r");
readFile(t, argv[i], fp);
fclose(fp);
}
}
if (argc > 2){ //if there's more than one file, print the total in the end
printf("%d %d %d total\n", t->lines, t->words, t->bytes);
}
else {
exit(0);
}
}
int main(int argc, char* argv[]){
Total* t = new_Total(0,0,0);
if (argc<2){ //no arguments
readStdin(t,'0円'); //pass /0 to readstin because as it is the only time it is called, there's no need to append the name of the argument
return 0;
}
readArgs(t, argc, argv);
free(t);
return 0;
}
As you can see, both in readStdin() and in readFile() i have the same piece of code that handles the counting. I've tried to make a function out of this using pointers but i gave up. Any suggestion would be much appreciated. Also, if you have any hint that has nothing to do with the problem that i mentioned, shoot it up!
4 Answers 4
A standard way to do this is to break the function up into smaller 'private' (static
) functions, each with one thing to do. One can pass the default to the counting function in case we are not provided the argument. This allows separation of the code which counts and the UI.
Total
is always going to be called with (0, 0, 0)
, it's unnecessary to provide these arguments. In fact, the storage of Total
in dynamic memory is probably unnecessary. No need to updateTotal
, use of the counters directly is fine. In fact, there is going to be 1 total
, might as well make it static.
fgetc
returns an int
, char ch
is not enough to tell EOF
, which is -1. You've defined BUF_SIZE
, an excellent optimization, but never use it. I'd go from fgetc
to fgets
. Note that this assumes that the file is text and doesn't contain a '0円'
inside, (eg, modified UTF-8.)
You've included getopt
and unistd
, making your code POSIX C99
, but you're not using those. Removing them makes your code more general C99
(now your code will compile on more compilers.)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
//stores the Total count of lines words and bytes
struct Total {
size_t lines, words, bytes;
} total;
//count the bytes, lines and words
//gives incorrect results on binary files with embedded 0
//a word is delimited by everything that {isgraph}
//using the unix definition of a text file, {lines = count('\n')}
static int readFile(FILE *fp) {
char buffer[1024], b;
int startWord = 1;
size_t i;
while(fgets(buffer, (int)sizeof buffer, fp)) {
i = 0;
do {
b = buffer[i++];
if(b == '\n') {
/* This will not recognize '\l' in old Mac text files opened
with "wb"? Hmm. */
total.lines++;
startWord = 1;
} else if(b == '0円') {
break;
} else if(isgraph(b)) {
//if there's a visible char, there's a word
if(startWord) total.words++, startWord = 0;
} else {
startWord = 1;
}
} while(1);
total.bytes += i;
}
return ferror(fp) ? 0 : 1;
}
int main(int argc, char *argv[]) {
FILE *fp = 0;
char *fn = "<no file>";
for(int i = 1; i < argc; i++) {
fn = argv[i];
fp = fopen(fn, "r");
if(!fp || !readFile(fp)) break;
fclose(fp), fp = 0;
}
if(argc <= 1) readFile(stdin);
if(fp) fclose(fp);
if(errno) return perror(fn), EXIT_FAILURE;
/* https://stackoverflow.com/a/2524675/2472827 */
printf("%zu lines; %zu words; %zu bytes total.\n",
total.lines, total.words, total.bytes);
return EXIT_SUCCESS;
}
Some see typedef struct
as problematic, and, in my opinion, it's not needed here; see Linux kernel coding style. INT_MAX
could be as little as 32,767; it's never going to be less than zero, so I'd prefer unsigned
, or size_t
in this context. What if there are other chars besides '\n', ' ', '\r', '\t'
? I've included <ctype.h>
to get rid of the ambiguity. There was a bug counting the word at the end of the line. Now, a line is defined as the number of '\n'
(UNIX style.) A word is delimited by isgraph
.
I see a number of things that may help you improve your program.
Use only the required #include
s
As far as I can tell, nothing from <getopt.h>
is actually used, so that could be omitted.
Eliminate unused definitions
The BUF_SIZE
definition is not used and should be eliminated.
Avoid allocating memory if possible
There's not really much need here for malloc
and free
. The Total
structure is small and easily fits on the stack frame. Avoiding malloc
means you'll never have to worry about free
ing the memory later.
Prefer a switch
to a long if...else
The while
loop only examines ch
but it does so inefficiently because potentially, it examines the same character multiple times. A switch
would be more efficient, shorter and more easily understood here.
Use bool
where appropriate
The startWord
variable is apparently being used as a boolean variable. For that reason, I'd suggest using #include <stdbool.h>
and declaring it as bool
.
Use const
where practical
The filename
argument is not and should not be altered by the readFile
function. For that reason, it would be better to declare that argument const
.
Consolidate code by combining cases
Instead of having completely separate readStdin
and readFile
functions, it would be sensible to simply retain only readFile
and initialize the FILE
structure like this:
FILE *file = (filename && filename[0] == '-') ? stdin : fopen(filename, "r");
Use the appropriate printf
format string
The code uses %d
for the format string, but would it ever make sense for the passed arguments to be negative? I would say not, and would suggest instead that these variables would more appropriately be size_t
and that the format string should be %lu
or %u
depending on your compiler's implementation.
Simplify code by using functions
There are four places in the code that use printf
to show totals. These could instead be reduced to a single place by creating a simple function:
static void report(const char *filename, size_t lines, size_t words, size_t letters) {
printf("%5lu %5lu %5lu %s\n", lines, words, letters, filename);
}
Don't quit after a single error
The way that wc
works is that it will keep going to process other files if one happens not to exist. To mimic that behavior, you'd have to alter your code to not exit
if the file does not exist.
Carefully consider whether a struct
is worthwhile
I'm all for consolidating variables into meaningfully named struct
s as you have done, but I think the code is simpler using the three plain variables. Here's my complete version which incorporates all of these suggestions so that you can compare for yourself:
#include <stdio.h>
#include <stdbool.h>
static void count(const char *filename, size_t *lines, size_t *words, size_t *letters) {
FILE *file = (filename && filename[0] == '-') ? stdin : fopen(filename, "r");
if (file == NULL) {
printf("%s: No such file or drectory\n", filename);
return;
}
bool inword = false;
for (int ch = fgetc(file); ch != EOF; ch = fgetc(file)) {
switch (ch) {
case '\n':
++*lines;
// fall through
case ' ':
case '\r':
case '\t':
if (inword) {
++*words;
}
inword = false;
break;
default:
inword = true;
}
++*letters;
}
fclose(file);
}
static void report(const char *filename, size_t lines, size_t words, size_t letters) {
printf("%5lu %5lu %5lu %s\n", lines, words, letters, filename);
}
int main(int argc, char* argv[]) {
size_t totallines = 0;
size_t totalwords = 0;
size_t totalletters = 0;
if (argc < 2) {
puts("Usage: wc filename [filename ...]");
return 0;
}
for (int i=1; i < argc; ++i) {
size_t lines = 0;
size_t words = 0;
size_t letters = 0;
count(argv[i], &lines, &words, &letters);
report(argv[i], lines, words, letters);
totallines += lines;
totalwords += words;
totalletters += letters;
}
if (argc > 2) {
report("total", totallines, totalwords, totalletters);
}
}
-
\$\begingroup\$ That's good advice. I would say that a
struct
is probably worthwhile, since the data are strongly linked together. \$\endgroup\$Neil– Neil2019年04月09日 22:07:39 +00:00Commented Apr 9, 2019 at 22:07 -
1\$\begingroup\$ @NeilEdelman: Yes, I could definitely argue it either way. In C++ I would definitely use a
class
(primarily for theoperator+=
andostream<<
functions) but I don't see as much benefit in this particular C code. Grouping can be handy, but it's also pretty easy to remember three or four well-named variables. \$\endgroup\$Edward– Edward2019年04月09日 23:27:25 +00:00Commented Apr 9, 2019 at 23:27 -
\$\begingroup\$ I don't know C++ as well as C, but isn't a
struct
and aclass
the same wrtoperator
? \$\endgroup\$Neil– Neil2019年04月10日 20:37:49 +00:00Commented Apr 10, 2019 at 20:37 -
1\$\begingroup\$ @NeilEdelman: The difference is that one can overload operators for custom
struct
orclass
types in C++, but one cannot overload operators at all in C. \$\endgroup\$Edward– Edward2019年04月12日 00:44:41 +00:00Commented Apr 12, 2019 at 0:44
Don't cast the result of malloc()
; it's unnecessary clutter and it makes it harder to make changes. For the same reason, it's better to use the pointed-to value rather than the type as argument to sizeof
:
Total *t = malloc(sizeof *t);
We need to test the result of malloc()
is not null before using it, else we run into Undefined Behaviour.
However, it's unnecessary to use dynamic allocation at all. Instead of new_Total()
we could provide init_Total()
to initialise an existing (e.g. stack-allocated) value:
void init_Total(Total *t, int lines, int words, int bytes)
{
assert(t);
//constructor function for Total
t->lines = lines;
t->words = words;
t->bytes = bytes;
}
Then, main()
can begin like this:
int main(int argc, char* argv[])
{
Total t;
init_Total(&t, 0, 0, 0);
We can use a single function for files and stdin
:
static bool readStream(Total *t, FILE* fp)
{
// common code here
}
bool readFile(Total *t, char* filename)
{
FILE *const f = fopen(filename, "r");
if (!f) {
perror(filename);
return false;
}
return readStream(t, f) & fclose(f);
}
bool readStdin(Total *t)
{
return readStream(t, stdin);
}
You might even feel that naming readStdin()
is overkill, and simply call readStream(t, stdin);
directly from main()
.
Well, i actually did it on my own so, if anyone's interested, here's the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#define BUF_SIZE 1024
typedef struct Counter{ //stores the count of lines, words, bytes and the value of starword
int lines;
int words;
int bytes;
int wordFlag; //when counting words, if one is found, set to 1. when digested, reset.
} Counter;
Counter* new_Counter(int lines, int words, int bytes, int wordFlag){ //constructor function for Counter
Counter* t = (Counter*)malloc(sizeof(Counter));
t->lines = lines;
t->words = words;
t->bytes = bytes;
t->wordFlag = wordFlag;
return t;
}
void updateTotal(Counter *t, int lines, int words, int bytes){ //updates the total counter
t->lines += lines;
t->words += words;
t->bytes += bytes;
}
void count(char ch, Counter* cnt){//counts the bytes, lines and words
cnt->bytes++; //if a char is passed, count it
if (ch == '\n'){ //count each line
cnt->lines++;
}
if (ch != ' ' && ch != '\n' && ch != '\r' && ch != '\t'){ //if there's a visible char, there's a word
cnt->wordFlag = 1;
}
if ((ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t') && cnt->wordFlag == 1){ //when the word ends
cnt->words++; //increment the word counter
cnt->wordFlag = 0; //reset the flag
}
}
void readStdin(Counter* t, char c){ //reads from stdin and prints
Counter* cnt = new_Counter(0,0,0,0);
char ch;
while((ch=fgetc(stdin)) != EOF){
count(ch, cnt);
}
updateTotal(t, cnt->lines, cnt->words, cnt->bytes);
if (c == '0円'){ //if the command was called with no arguments
printf ("%d %d %d\n", cnt->lines, cnt->words, cnt->bytes);
}
else{
printf ("%d %d %d %c\n", cnt->lines, cnt->words, cnt->bytes, c);
}
free(cnt);
}
void readFile(Counter *t, char* filename, FILE* fp){
Counter* cnt = new_Counter(0,0,0,0);
char ch;
if (fp == NULL){
fprintf(stderr, "./my_wc: %s: No such file or directory\n", filename);
exit(1);
}
while ((ch=fgetc(fp)) != EOF){ //count the bytes, lines and words
count(ch, cnt);
}
updateTotal(t, cnt->lines, cnt->words, cnt->bytes);
printf("%d %d %d %s\n", cnt->lines, cnt->words, cnt->bytes, filename);
}
void readArgs(Counter* t, int argc, char* argv[]){
FILE* fp;
for (int i=1; i<argc; i++){
if (*argv[i] == '-'){//if a '-' is found, read from stdin
readStdin(t, '-');
clearerr(stdin);
}
else { //tries to read the file in argv[i]
fp = fopen(argv[i], "r");
readFile(t, argv[i], fp);
fclose(fp);
}
}
if (argc > 2){ //if there's more than one file, print the Counter in the end
printf("%d %d %d total\n", t->lines, t->words, t->bytes);
}
else {
exit(0);
}
}
int main(int argc, char* argv[]){
Counter* t = new_Counter(0,0,0,0); //pointer to the total counter
if (argc<2){ //no arguments
readStdin(t,'0円'); //pass /0 to readstin because as it is the only time it is called, there's no need to append the name of the argument
return 0;
}
readArgs(t, argc, argv);
free(t);
return 0;
}
As you can see, i renamed the struct to Counter and i used it to store, not only the total counter, but also every temporary counter, then i implemented the function count() that does with the temporary counter values what those if statements in readStdin and readFile do. If you still have any suggestions, please do not hesitate.
wc
command?wc
has been around much longer than Linux has! \$\endgroup\$