3
\$\begingroup\$

I'm working on my implementaion of the linux wc command. I finally have something that is working properly (with no options yet) but i think it needs a lot of "cleaning". I mean, i highly disrespect the "do not repeat yourself rule" and i want to hear some ideas on how can i polish this up a bit.

Here 's the code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#define BUF_SIZE 1024
typedef struct Total{ //stores the Total count of lines words and bytes
 int lines;
 int words;
 int bytes;
} Total;
Total* new_Total(int lines, int words, int bytes){ //constructor function for Total
 Total* t = (Total*)malloc(sizeof(Total));
 t->lines = lines;
 t->words = words;
 t->bytes = bytes;
 return t;
}
void updateTotal(Total *t, int lines, int words, int bytes){ //updates Total counters
 t->lines += lines;
 t->words += words;
 t->bytes += bytes;
}
void readStdin(Total* t, char c){ //reads from stdin, counts and prints
 int lines = 0, words = 0, bytes = 0, startWord = 0;
 char ch;
 while((ch=fgetc(stdin)) != EOF){//count the bytes, lines and words
 bytes++;
 if (ch == '\n'){
 lines++;
 }
 if (ch != ' ' && ch != '\n' && ch != '\r' && ch != '\t'){ //if there's a visible char, there's a word
 startWord = 1;
 }
 if ((ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t') && startWord == 1){ //when the word ends
 words++; //increment the word coutner
 startWord = 0; //reset word finder
 }
 }
 updateTotal(t, lines, words, bytes);
 if (c == '0円'){ //if the command was called with no arguments
 printf ("%d %d %d\n", lines, words, bytes);
 }
 else{
 printf ("%d %d %d %c\n", lines, words, bytes, c);
 }
}
void readFile(Total *t, char* filename, FILE* fp){
 int lines = 0, words = 0, bytes = 0, startWord = 0;
 char ch;
 if (fp == NULL){
 printf("%s: No such file or drectory\n", filename);
 exit(1);
 }
 while ((ch=fgetc(fp)) != EOF){ //count the bytes, lines and words
 bytes++;
 if (ch == '\n'){
 lines++;
 }
 if (ch != ' ' && ch != '\n' && ch != '\r' && ch != '\t'){
 startWord = 1;
 }
 if ((ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t') && startWord == 1){
 words++;
 startWord = 0;
 }
 }
 updateTotal(t, lines, words, bytes);
 printf("%d %d %d %s\n", lines, words, bytes, filename);
}
void readArgs(Total* t, int argc, char* argv[]){
 FILE* fp;
 for (int i=1; i<argc; i++){
 if (*argv[i] == '-'){//if a '-' is found, read from stdin
 readStdin(t, '-');
 clearerr(stdin);
 }
 else { //tries to read the file in argv[i]
 fp = fopen(argv[i], "r");
 readFile(t, argv[i], fp);
 fclose(fp);
 }
 }
 if (argc > 2){ //if there's more than one file, print the total in the end
 printf("%d %d %d total\n", t->lines, t->words, t->bytes);
 }
 else {
 exit(0);
 }
}
int main(int argc, char* argv[]){
 Total* t = new_Total(0,0,0);
 if (argc<2){ //no arguments
 readStdin(t,'0円'); //pass /0 to readstin because as it is the only time it is called, there's no need to append the name of the argument
 return 0;
 }
 readArgs(t, argc, argv);
 free(t);
 return 0;
}

As you can see, both in readStdin() and in readFile() i have the same piece of code that handles the counting. I've tried to make a function out of this using pointers but i gave up. Any suggestion would be much appreciated. Also, if you have any hint that has nothing to do with the problem that i mentioned, shoot it up!

200_success
145k22 gold badges190 silver badges478 bronze badges
asked Apr 8, 2019 at 20:20
\$\endgroup\$
1
  • \$\begingroup\$ Why describe it as Linux wc command? wc has been around much longer than Linux has! \$\endgroup\$ Commented Apr 9, 2019 at 15:49

4 Answers 4

4
\$\begingroup\$

A standard way to do this is to break the function up into smaller 'private' (static) functions, each with one thing to do. One can pass the default to the counting function in case we are not provided the argument. This allows separation of the code which counts and the UI.

Total is always going to be called with (0, 0, 0), it's unnecessary to provide these arguments. In fact, the storage of Total in dynamic memory is probably unnecessary. No need to updateTotal, use of the counters directly is fine. In fact, there is going to be 1 total, might as well make it static.

fgetc returns an int, char ch is not enough to tell EOF, which is -1. You've defined BUF_SIZE, an excellent optimization, but never use it. I'd go from fgetc to fgets. Note that this assumes that the file is text and doesn't contain a '0円' inside, (eg, modified UTF-8.)

You've included getopt and unistd, making your code POSIX C99, but you're not using those. Removing them makes your code more general C99 (now your code will compile on more compilers.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
//stores the Total count of lines words and bytes
struct Total {
 size_t lines, words, bytes;
} total;
//count the bytes, lines and words
//gives incorrect results on binary files with embedded 0
//a word is delimited by everything that {isgraph}
//using the unix definition of a text file, {lines = count('\n')}
static int readFile(FILE *fp) {
 char buffer[1024], b;
 int startWord = 1;
 size_t i;
 while(fgets(buffer, (int)sizeof buffer, fp)) {
 i = 0;
 do {
 b = buffer[i++];
 if(b == '\n') {
 /* This will not recognize '\l' in old Mac text files opened
 with "wb"? Hmm. */
 total.lines++;
 startWord = 1;
 } else if(b == '0円') {
 break;
 } else if(isgraph(b)) {
 //if there's a visible char, there's a word
 if(startWord) total.words++, startWord = 0;
 } else {
 startWord = 1;
 }
 } while(1);
 total.bytes += i;
 }
 return ferror(fp) ? 0 : 1;
}
int main(int argc, char *argv[]) {
 FILE *fp = 0;
 char *fn = "<no file>";
 for(int i = 1; i < argc; i++) {
 fn = argv[i];
 fp = fopen(fn, "r");
 if(!fp || !readFile(fp)) break;
 fclose(fp), fp = 0;
 }
 if(argc <= 1) readFile(stdin);
 if(fp) fclose(fp);
 if(errno) return perror(fn), EXIT_FAILURE;
 /* https://stackoverflow.com/a/2524675/2472827 */
 printf("%zu lines; %zu words; %zu bytes total.\n",
 total.lines, total.words, total.bytes);
 return EXIT_SUCCESS;
}

Some see typedef struct as problematic, and, in my opinion, it's not needed here; see Linux kernel coding style. INT_MAX could be as little as 32,767; it's never going to be less than zero, so I'd prefer unsigned, or size_t in this context. What if there are other chars besides '\n', ' ', '\r', '\t'? I've included <ctype.h> to get rid of the ambiguity. There was a bug counting the word at the end of the line. Now, a line is defined as the number of '\n' (UNIX style.) A word is delimited by isgraph.

answered Apr 8, 2019 at 23:28
\$\endgroup\$
3
\$\begingroup\$

I see a number of things that may help you improve your program.

Use only the required #includes

As far as I can tell, nothing from <getopt.h> is actually used, so that could be omitted.

Eliminate unused definitions

The BUF_SIZE definition is not used and should be eliminated.

Avoid allocating memory if possible

There's not really much need here for malloc and free. The Total structure is small and easily fits on the stack frame. Avoiding malloc means you'll never have to worry about freeing the memory later.

Prefer a switch to a long if...else

The while loop only examines ch but it does so inefficiently because potentially, it examines the same character multiple times. A switch would be more efficient, shorter and more easily understood here.

Use bool where appropriate

The startWord variable is apparently being used as a boolean variable. For that reason, I'd suggest using #include <stdbool.h> and declaring it as bool.

Use const where practical

The filename argument is not and should not be altered by the readFile function. For that reason, it would be better to declare that argument const.

Consolidate code by combining cases

Instead of having completely separate readStdin and readFile functions, it would be sensible to simply retain only readFile and initialize the FILE structure like this:

FILE *file = (filename && filename[0] == '-') ? stdin : fopen(filename, "r");

Use the appropriate printf format string

The code uses %d for the format string, but would it ever make sense for the passed arguments to be negative? I would say not, and would suggest instead that these variables would more appropriately be size_t and that the format string should be %lu or %u depending on your compiler's implementation.

Simplify code by using functions

There are four places in the code that use printf to show totals. These could instead be reduced to a single place by creating a simple function:

static void report(const char *filename, size_t lines, size_t words, size_t letters) {
 printf("%5lu %5lu %5lu %s\n", lines, words, letters, filename);
}

Don't quit after a single error

The way that wc works is that it will keep going to process other files if one happens not to exist. To mimic that behavior, you'd have to alter your code to not exit if the file does not exist.

Carefully consider whether a struct is worthwhile

I'm all for consolidating variables into meaningfully named structs as you have done, but I think the code is simpler using the three plain variables. Here's my complete version which incorporates all of these suggestions so that you can compare for yourself:

#include <stdio.h>
#include <stdbool.h>
static void count(const char *filename, size_t *lines, size_t *words, size_t *letters) {
 FILE *file = (filename && filename[0] == '-') ? stdin : fopen(filename, "r");
 if (file == NULL) {
 printf("%s: No such file or drectory\n", filename);
 return;
 }
 bool inword = false;
 for (int ch = fgetc(file); ch != EOF; ch = fgetc(file)) {
 switch (ch) {
 case '\n':
 ++*lines;
 // fall through
 case ' ': 
 case '\r':
 case '\t':
 if (inword) {
 ++*words;
 }
 inword = false;
 break;
 default:
 inword = true; 
 }
 ++*letters;
 }
 fclose(file);
}
static void report(const char *filename, size_t lines, size_t words, size_t letters) {
 printf("%5lu %5lu %5lu %s\n", lines, words, letters, filename);
}
int main(int argc, char* argv[]) {
 size_t totallines = 0;
 size_t totalwords = 0;
 size_t totalletters = 0;
 if (argc < 2) {
 puts("Usage: wc filename [filename ...]");
 return 0;
 }
 for (int i=1; i < argc; ++i) {
 size_t lines = 0;
 size_t words = 0;
 size_t letters = 0;
 count(argv[i], &lines, &words, &letters);
 report(argv[i], lines, words, letters);
 totallines += lines;
 totalwords += words;
 totalletters += letters;
 }
 if (argc > 2) {
 report("total", totallines, totalwords, totalletters);
 }
}
answered Apr 9, 2019 at 14:03
\$\endgroup\$
4
  • \$\begingroup\$ That's good advice. I would say that a struct is probably worthwhile, since the data are strongly linked together. \$\endgroup\$ Commented Apr 9, 2019 at 22:07
  • 1
    \$\begingroup\$ @NeilEdelman: Yes, I could definitely argue it either way. In C++ I would definitely use a class (primarily for the operator+= and ostream<< functions) but I don't see as much benefit in this particular C code. Grouping can be handy, but it's also pretty easy to remember three or four well-named variables. \$\endgroup\$ Commented Apr 9, 2019 at 23:27
  • \$\begingroup\$ I don't know C++ as well as C, but isn't a struct and a class the same wrt operator? \$\endgroup\$ Commented Apr 10, 2019 at 20:37
  • 1
    \$\begingroup\$ @NeilEdelman: The difference is that one can overload operators for custom struct or class types in C++, but one cannot overload operators at all in C. \$\endgroup\$ Commented Apr 12, 2019 at 0:44
2
\$\begingroup\$

Don't cast the result of malloc(); it's unnecessary clutter and it makes it harder to make changes. For the same reason, it's better to use the pointed-to value rather than the type as argument to sizeof:

Total *t = malloc(sizeof *t);

We need to test the result of malloc() is not null before using it, else we run into Undefined Behaviour.

However, it's unnecessary to use dynamic allocation at all. Instead of new_Total() we could provide init_Total() to initialise an existing (e.g. stack-allocated) value:

void init_Total(Total *t, int lines, int words, int bytes)
{
 assert(t);
 //constructor function for Total
 t->lines = lines;
 t->words = words;
 t->bytes = bytes;
}

Then, main() can begin like this:

int main(int argc, char* argv[])
{
 Total t;
 init_Total(&t, 0, 0, 0);

We can use a single function for files and stdin:

static bool readStream(Total *t, FILE* fp)
{
 // common code here
}
bool readFile(Total *t, char* filename)
{
 FILE *const f = fopen(filename, "r");
 if (!f) {
 perror(filename);
 return false;
 }
 return readStream(t, f) & fclose(f);
}
bool readStdin(Total *t)
{
 return readStream(t, stdin);
}

You might even feel that naming readStdin() is overkill, and simply call readStream(t, stdin); directly from main().

answered Apr 9, 2019 at 10:53
\$\endgroup\$
0
\$\begingroup\$

Well, i actually did it on my own so, if anyone's interested, here's the code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#define BUF_SIZE 1024
typedef struct Counter{ //stores the count of lines, words, bytes and the value of starword
 int lines;
 int words;
 int bytes;
 int wordFlag; //when counting words, if one is found, set to 1. when digested, reset.
} Counter;
Counter* new_Counter(int lines, int words, int bytes, int wordFlag){ //constructor function for Counter
 Counter* t = (Counter*)malloc(sizeof(Counter));
 t->lines = lines;
 t->words = words;
 t->bytes = bytes;
 t->wordFlag = wordFlag;
 return t;
}
void updateTotal(Counter *t, int lines, int words, int bytes){ //updates the total counter
 t->lines += lines;
 t->words += words;
 t->bytes += bytes;
}
void count(char ch, Counter* cnt){//counts the bytes, lines and words
 cnt->bytes++; //if a char is passed, count it
 if (ch == '\n'){ //count each line
 cnt->lines++;
 }
 if (ch != ' ' && ch != '\n' && ch != '\r' && ch != '\t'){ //if there's a visible char, there's a word
 cnt->wordFlag = 1;
 }
 if ((ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t') && cnt->wordFlag == 1){ //when the word ends
 cnt->words++; //increment the word counter
 cnt->wordFlag = 0; //reset the flag
 }
}
void readStdin(Counter* t, char c){ //reads from stdin and prints
 Counter* cnt = new_Counter(0,0,0,0);
 char ch;
 while((ch=fgetc(stdin)) != EOF){
 count(ch, cnt);
 }
 updateTotal(t, cnt->lines, cnt->words, cnt->bytes);
 if (c == '0円'){ //if the command was called with no arguments
 printf ("%d %d %d\n", cnt->lines, cnt->words, cnt->bytes);
 }
 else{
 printf ("%d %d %d %c\n", cnt->lines, cnt->words, cnt->bytes, c);
 }
 free(cnt);
}
void readFile(Counter *t, char* filename, FILE* fp){
 Counter* cnt = new_Counter(0,0,0,0);
 char ch;
 if (fp == NULL){
 fprintf(stderr, "./my_wc: %s: No such file or directory\n", filename);
 exit(1);
 }
 while ((ch=fgetc(fp)) != EOF){ //count the bytes, lines and words
 count(ch, cnt);
 }
 updateTotal(t, cnt->lines, cnt->words, cnt->bytes);
 printf("%d %d %d %s\n", cnt->lines, cnt->words, cnt->bytes, filename);
}
void readArgs(Counter* t, int argc, char* argv[]){
 FILE* fp;
 for (int i=1; i<argc; i++){
 if (*argv[i] == '-'){//if a '-' is found, read from stdin
 readStdin(t, '-');
 clearerr(stdin);
 }
 else { //tries to read the file in argv[i]
 fp = fopen(argv[i], "r");
 readFile(t, argv[i], fp);
 fclose(fp);
 }
 }
 if (argc > 2){ //if there's more than one file, print the Counter in the end
 printf("%d %d %d total\n", t->lines, t->words, t->bytes);
 }
 else {
 exit(0);
 }
}
int main(int argc, char* argv[]){
 Counter* t = new_Counter(0,0,0,0); //pointer to the total counter
 if (argc<2){ //no arguments
 readStdin(t,'0円'); //pass /0 to readstin because as it is the only time it is called, there's no need to append the name of the argument
 return 0;
 }
 readArgs(t, argc, argv);
 free(t);
 return 0;
}

As you can see, i renamed the struct to Counter and i used it to store, not only the total counter, but also every temporary counter, then i implemented the function count() that does with the temporary counter values what those if statements in readStdin and readFile do. If you still have any suggestions, please do not hesitate.

answered Apr 8, 2019 at 21:57
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.