Parse string into a struct

Question 1

I'm writing a C application for the Networking course at the University. Basically it's a UDP server that can receive two commands:

GET key
SET key value

That are executed against a Hash Table data structure. I'm now writing the code that parses the string into a command struct, and the logic is the following:

A command cannot be empty and is recognized only if it starts with GET or SET
A key cannot be null and cannot be longer than 100 characters.
In a SET command the data cannot be null

The code seems to work so far, but I'm not so skilled in writing C code, so I'm worried about some hidden bugs or mistakes that I did.
So, if someone can review my code it will be greatly appreciated!

The header file:

#ifndef DUCKY_COMMAND_H
#define DUCKY_COMMAND_H
#include <stddef.h>
#define MAX_KEY_LENGTH 100
#define ERR_COMMAND_NOT_RECOGNIZED -1
#define ERR_KEY_LENGTH -2
#define ERR_NO_KEY -3
#define ERR_NO_DATA -4
typedef struct command {
 enum {
 GET, SET
 } command_type;
 char *key;
 char *data;
} command;
int parse_command(char *buffer, command *c);
#endif

The function that parses a string into a command struct:

#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include "command.h"
int parse_command(char * buffer, command* c) {
 int i = 0;
 char *delimiter = " ";
 char *token = strtok(buffer, delimiter);
 char **arr = calloc(3, sizeof(char *)); // I know that a command can have at maximum 3 tokens
 // Split the string into an array of strings
 while (token != NULL) {
 token[strcspn(token, "\n")] = '0円'; // Remove trailing new lines
 arr[i] = calloc(1, strlen(token) + 1);
 strcpy(arr[i], token);
 token = strtok(NULL, delimiter); // Next token
 i++;
 }
 if (strcmp(arr[0], "SET") != 0 && strcmp(arr[0], "GET") != 0) {
 return ERR_COMMAND_NOT_RECOGNIZED;
 }
 // Parse SET command
 if (strcmp(arr[0], "SET") == 0) {
 if (arr[1] == NULL) {
 return ERR_NO_KEY;
 }
 // Validate key length
 if (strlen(arr[1]) > MAX_KEY_LENGTH) {
 return ERR_KEY_LENGTH;
 }
 // Validate data
 if (arr[2] == NULL) {
 return ERR_NO_DATA;
 }
 c->command_type = SET;
 c->key = arr[1];
 c->data = arr[2];
 }
 // Parse GET command
 if (strcmp(arr[0], "GET") == 0) {
 if (arr[1] == NULL) {
 return ERR_NO_KEY;
 }
 // Validate key length
 if (strlen(arr[1]) > MAX_KEY_LENGTH) {
 return ERR_KEY_LENGTH;
 }
 c->command_type = GET;
 c->key = arr[1];
 }
 return 0;
}

Here are the unit tests (for which I'm using greatest):

#include "../lib/greatest.h"
#include "../src/command.h"
TEST should_parse_a_SET_command_from_a_string(void) {
 command c;
 char buffer[] = "SET key string";
 int result = parse_command(buffer, &c);
 ASSERT_EQ(0, result);
 ASSERT_EQ(SET, c.command_type);
 ASSERT_STR_EQ("key", c.key);
 ASSERT_STR_EQ("string", c.data);
 PASS();
}
TEST should_return_ERR_NO_KEY_if_the_SET_command_has_not_an_associated_key(void) {
 command c;
 char buffer[] = "SET";
 int result = parse_command(buffer, &c);
 ASSERT_EQ(ERR_NO_KEY, result);
 PASS();
}
TEST should_return_ERR_NO_DATA_if_the_SET_command_has_no_data_associated(void) {
 command c;
 char buffer[] = "SET key ";
 int result = parse_command(buffer, &c);
 ASSERT_EQ(ERR_NO_DATA, result);
 PASS();
}
TEST should_parse_a_GET_command_from_a_string(void) {
 command c;
 char buffer[] = "GET key";
 int result = parse_command(buffer, &c);
 ASSERT_EQ(0, result);
 ASSERT_EQ(GET, c.command_type);
 ASSERT_STR_EQ("key", c.key);
 PASS();
}
TEST should_return_ERR_NO_KEY_if_the_GET_command_has_not_an_associated_key(void) {
 command get;
 char buffer[] = "GET";
 int result = parse_command(buffer, &get);
 ASSERT_EQ(ERR_NO_KEY, result);
 PASS();
}
TEST should_return_ERR_KEY_LENGTH_if_the_GET_or_SET_key_length_is_greater_than_100_chars(void) {
 int result;
 command get, set;
 char get_buffer[] = "GET abcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdebcde 10 string";
 char set_buffer[] = "SET abcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdeabcdebcde 10 string";
 result = parse_command(get_buffer, &get);
 ASSERT_EQ(ERR_KEY_LENGTH, result);
 result = parse_command(set_buffer, &set);
 ASSERT_EQ(ERR_KEY_LENGTH, result);
 PASS();
}
TEST should_return_ERR_COMMAND_NOT_RECOGNIZED_if_the_command_is_not_recognized() {
 command c;
 char buffer[] = "FOO BAR key 10 20";
 int result = parse_command(buffer, &c);
 ASSERT_EQ(ERR_COMMAND_NOT_RECOGNIZED, result);
 PASS();
}
SUITE(suite) {
 RUN_TEST(should_parse_a_SET_command_from_a_string);
 RUN_TEST(should_return_ERR_NO_KEY_if_the_SET_command_has_not_an_associated_key);
 RUN_TEST(should_return_ERR_NO_DATA_if_the_SET_command_has_no_data_associated);
 RUN_TEST(should_parse_a_GET_command_from_a_string);
 RUN_TEST(should_return_ERR_NO_KEY_if_the_GET_command_has_not_an_associated_key);
 RUN_TEST(should_return_ERR_KEY_LENGTH_if_the_GET_or_SET_key_length_is_greater_than_100_chars);
 RUN_TEST(should_return_ERR_COMMAND_NOT_RECOGNIZED_if_the_command_is_not_recognized);
}
GREATEST_MAIN_DEFS();
int main(int argc, char *argv[]) {
 GREATEST_MAIN_BEGIN();
 RUN_SUITE(suite);
 GREATEST_MAIN_END();
}

Question 2

You calloc arr, but you know the length of it is always 3. So you can instead allocate it on the stack using C's built in array type. The = {0} part means that all elements are set to all 0's (NULL's) at the start, which calloc does for you automatically.
```
 char *arr[3] = {0};
```
When you're splitting the input buffer on spaces, you try and write to arr even if you get more than three arguments. This is easily fixed by returning an error if i == 3 in the loop. (Exercise for the reader: set a flag instead of immediately returning so that if there are other errors, like if the command is unknown, you can return those instead. Or make the errors powers of two and return them OR'ed together, so that you can return multiple errors in one go.)
Instead of copying the string inside buffer, we can just point to that string in arr. Note, however, that this means that buffer must outlive the command. If that causes problems, just don't use this modification.

I would also make it clearer that i is only used inside that while loop, and rewrite it as a for loop.

 for (int i = 0; token != NULL; i++) {
 if (i >= 3) return ERR_TOO_MANY_ARGUMENTS; // A new error type for the .h file
 token[strcspn(token, "\n")] = '0円'; // Remove trailing new lines
 arr[i] = token;
 token = strtok(NULL, delimiter); // Next token
 }

You can replace all instances of arr[n] with more readable names by #defineing macros:
```
 #define COMMAND (arr[0])
 #define KEY (arr[1])
 #define DATA (arr[2])
```
You can replace strcmp(...) != 0 with strcmp(...) and strcmp(...) == 0 with !strcmp(...). (Don't worry, code with all the modifications will be posted at the end.)
I would replace the if-statement for GET with an else to the SET if-statement, and then assert that the command is GET. (This makes sure that we don't make a mistake where we let a command through and it falls through both if-statements and does nothing, not modifying the command.)

Here's the code in all it's glory:

 #include <assert.h>
 #include <string.h>
 #include "command.h"
 int parse_command(char * buffer, command* c) {
 char *delimiter = " ";
 char *token = strtok(buffer, delimiter);
 // We can allocate the char* pointers on the stack, since the length is
 // constant.
 char *arr[3] = {0};
 // Split the string into an array of strings
 for (int i = 0; token != NULL; i++) {
 // We make sure that if there are more than 3 arguments, we
 // don't overwrite memory after `arr`.
 if (i >= 3) return ERR_TOO_MANY_ARGUMENTS;
 token[strcspn(token, "\n")] = '0円'; // Remove trailing new lines
 // We can store a pointer to `token` (which is inside `buffer`)
 // instead of copying the string.
 // Note, however, that this means that `buffer` must outlive
 // `command`.
 arr[i] = token;
 token = strtok(NULL, delimiter); // Next token
 }
 // "Magic numbers", like arr[0], should be defined in a
 // macro or constant with a descriptive name.
 #define COMMAND (arr[0])
 #define KEY (arr[1])
 #define DATA (arr[2])
 // Redundant comparisons against 0, are, well, redundant.
 if (strcmp(COMMAND, "SET") && strcmp(COMMAND, "GET")) {
 return ERR_COMMAND_NOT_RECOGNIZED;
 }
 // Parse SET command
 if (!strcmp(COMMAND, "SET")) {
 if (KEY == NULL) return ERR_NO_KEY;
 // Validate key length
 if (strlen(KEY) > MAX_KEY_LENGTH) {
 return ERR_KEY_LENGTH;
 }
 // Validate data
 if (DATA == NULL) {
 return ERR_NO_DATA;
 }
 c->command_type = SET;
 c->key = KEY;
 c->data = DATA;
 } else {
 assert(!strcmp(COMMAND, "GET"));
 if (KEY == NULL) {
 return ERR_NO_KEY;
 }
 // Validate key length
 if (strlen(KEY) > MAX_KEY_LENGTH) {
 return ERR_KEY_LENGTH;
 }
 c->command_type = GET;
 c->key = KEY;
 }
 return 0;
 }

Question 3

parses a string into a command struct

Your use of testing is very effective, and isolating the parser from the rest of the system is good. However, a parser is difficult to test completely. You could use a parser-generator to have more confidence that your code is correct. Also, it's shorter and faster then building your own from the standard library. re2c has a similar example to parse an IPv4 address; I've successfully replaced your parser with this code.

#include <assert.h>
#include "../src/command.h"
/** Returns whether the command could be parsed. */
int parse_command(char *buffer, struct command *c) {
 char *YYCURSOR = buffer, *YYMARKER, *o1, *o2, *o3, *o4;
/*!stags:re2c format = 'char *@@;'; */
 assert(buffer && c);
scan:
/*!re2c
 // http://re2c.org/manual/manual_c.html#submatch-extraction
 re2c:yyfill:enable = 0;
 re2c:flags:tags = 1;
 re2c:define:YYCTYPE = char;
 end = "\x00";
 ws = [ \t\v\f\n\r];
 string = [^ \t\v\f\n\r\x00]{1,100};
 ws* { goto scan; } // skip blank lines and whitespace
 "GET" ws+ @o1 string @o2 ws* end {
 c->command_type = GET;
 c->key = o1, *o2 = '0円';
 c->data = 0;
 return 1;
 }
 "SET" ws+ @o1 string @o2 ws+ @o3 string @o4 ws* end {
 c->command_type = SET;
 c->key = o1, *o2 = '0円';
 c->data = o3, *o4 = '0円';
 return 1;
 }
 * { return 0; }
*/
}

I simplified the contract, instead of returning an error code, it returns success. Use re2c -o parse.c parse.c.re and modify the test programme. This also gets rid of the copying to dynamic memory that should probably be freed.

Question 4

First off, you could use constants for the command strings:

char COMMAND_SET[] = "SET";
char COMMAND_GET[] = "GET";

The processing of the two commands can be slightly generalized:

if (arr[1] == NULL) {
 return ERR_NO_KEY;
}
// Validate key length
if (strlen(arr[1]) > MAX_KEY_LENGTH) {
 return ERR_KEY_LENGTH;
}
c->key = arr[1];
// Parse SET command
if (strcmp(arr[0], "SET") == 0) {
 // Validate data
 if (arr[2] == NULL) {
 return ERR_NO_DATA;
 }
 c->command_type = SET;
 c->data = arr[2];
}
// Parse GET command
if (strcmp(arr[0], "GET") == 0) {
 c->command_type = GET;
}

I don't have a lot of experience with unit testing in C, so unfortunately I can't properly comment on that.

Question 5

Welcome to code review. I'm not sure of the value of using symbolic constants is in this case, since "SET" and "GET" are by default constants.

Question 6

Hi, Thank you so much for your response! In regards to code safety, is it fine? For example the usage of strcmp, or or calloc? Have I some bufferoverflow or other subtle bug around?

Simon Branch Simon BranchSimon Branch 862 bronze badges · Accepted Answer · 2020-08-13 03:22:29Z

You calloc arr, but you know the length of it is always 3. So you can instead allocate it on the stack using C's built in array type. The = {0} part means that all elements are set to all 0's (NULL's) at the start, which calloc does for you automatically.
```
 char *arr[3] = {0};
```
When you're splitting the input buffer on spaces, you try and write to arr even if you get more than three arguments. This is easily fixed by returning an error if i == 3 in the loop. (Exercise for the reader: set a flag instead of immediately returning so that if there are other errors, like if the command is unknown, you can return those instead. Or make the errors powers of two and return them OR'ed together, so that you can return multiple errors in one go.)
Instead of copying the string inside buffer, we can just point to that string in arr. Note, however, that this means that buffer must outlive the command. If that causes problems, just don't use this modification.

I would also make it clearer that i is only used inside that while loop, and rewrite it as a for loop.

 for (int i = 0; token != NULL; i++) {
 if (i >= 3) return ERR_TOO_MANY_ARGUMENTS; // A new error type for the .h file
 token[strcspn(token, "\n")] = '0円'; // Remove trailing new lines
 arr[i] = token;
 token = strtok(NULL, delimiter); // Next token
 }

You can replace all instances of arr[n] with more readable names by #defineing macros:
```
 #define COMMAND (arr[0])
 #define KEY (arr[1])
 #define DATA (arr[2])
```
You can replace strcmp(...) != 0 with strcmp(...) and strcmp(...) == 0 with !strcmp(...). (Don't worry, code with all the modifications will be posted at the end.)
I would replace the if-statement for GET with an else to the SET if-statement, and then assert that the command is GET. (This makes sure that we don't make a mistake where we let a command through and it falls through both if-statements and does nothing, not modifying the command.)

Here's the code in all it's glory:

 #include <assert.h>
 #include <string.h>
 #include "command.h"
 int parse_command(char * buffer, command* c) {
 char *delimiter = " ";
 char *token = strtok(buffer, delimiter);
 // We can allocate the char* pointers on the stack, since the length is
 // constant.
 char *arr[3] = {0};
 // Split the string into an array of strings
 for (int i = 0; token != NULL; i++) {
 // We make sure that if there are more than 3 arguments, we
 // don't overwrite memory after `arr`.
 if (i >= 3) return ERR_TOO_MANY_ARGUMENTS;
 token[strcspn(token, "\n")] = '0円'; // Remove trailing new lines
 // We can store a pointer to `token` (which is inside `buffer`)
 // instead of copying the string.
 // Note, however, that this means that `buffer` must outlive
 // `command`.
 arr[i] = token;
 token = strtok(NULL, delimiter); // Next token
 }
 // "Magic numbers", like arr[0], should be defined in a
 // macro or constant with a descriptive name.
 #define COMMAND (arr[0])
 #define KEY (arr[1])
 #define DATA (arr[2])
 // Redundant comparisons against 0, are, well, redundant.
 if (strcmp(COMMAND, "SET") && strcmp(COMMAND, "GET")) {
 return ERR_COMMAND_NOT_RECOGNIZED;
 }
 // Parse SET command
 if (!strcmp(COMMAND, "SET")) {
 if (KEY == NULL) return ERR_NO_KEY;
 // Validate key length
 if (strlen(KEY) > MAX_KEY_LENGTH) {
 return ERR_KEY_LENGTH;
 }
 // Validate data
 if (DATA == NULL) {
 return ERR_NO_DATA;
 }
 c->command_type = SET;
 c->key = KEY;
 c->data = DATA;
 } else {
 assert(!strcmp(COMMAND, "GET"));
 if (KEY == NULL) {
 return ERR_NO_KEY;
 }
 // Validate key length
 if (strlen(KEY) > MAX_KEY_LENGTH) {
 return ERR_KEY_LENGTH;
 }
 c->command_type = GET;
 c->key = KEY;
 }
 return 0;
 }

Stack Exchange Network

Parse string into a struct

3 Answers 3

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Parse string into a struct

3 Answers 3

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions