The purpose of this code is to parse any expression as could appear in C that involves doubles or functions that involve only doubles.
I have written parsers before:
This is an upgraded version that uses more descriptive variable names and structures instead of parallel arrays. This version can also handle functions of varying arity. I was even planning on doing short string optimization here but had to skip that because reasons.
#include <math.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
typedef struct Symbol {
unsigned arity, len;
const char *name;
union {
double
func0,
(*func1)(double),
(*func2)(double, double),
(*func3)(double, double, double);
};
} Symbol;
static int compar(const void *const restrict strp, const void *const restrict p) {
const unsigned len = ((const Symbol *)p)->len;
const char
*const str = *(const char *const *)strp,
*const name = ((const Symbol *)p)->name;
const int cmp = memcmp(str, name, len);
return cmp
? cmp
: isalnum((unsigned char)str[len]) || str[len] == '_'
? str[len]-name[len]
: (*(const char **)strp += len, 0);
}
#define SYMBOL(a, n)\
{\
.arity = a,\
.len = sizeof(#n)-1,\
.name = #n,\
.func##a = n\
}
static const Symbol symbols[] = {
SYMBOL(0, M_1_PI),
SYMBOL(0, M_2_PI),
SYMBOL(0, M_2_SQRTPI),
SYMBOL(0, M_E),
SYMBOL(0, M_LN10),
SYMBOL(0, M_LN2),
SYMBOL(0, M_LOG10E),
SYMBOL(0, M_LOG2E),
SYMBOL(0, M_PI),
SYMBOL(0, M_PI_2),
SYMBOL(0, M_PI_4),
SYMBOL(0, M_SQRT1_2),
SYMBOL(0, M_SQRT2),
SYMBOL(1, acos),
SYMBOL(1, acosh),
SYMBOL(1, asin),
SYMBOL(1, asinh),
SYMBOL(1, atan),
SYMBOL(2, atan2),
SYMBOL(1, atanh),
SYMBOL(1, cbrt),
SYMBOL(1, ceil),
SYMBOL(1, cos),
SYMBOL(1, cosh),
SYMBOL(2, copysign),
SYMBOL(1, erf),
SYMBOL(1, erfc),
SYMBOL(1, exp),
SYMBOL(1, exp2),
SYMBOL(1, expm1),
SYMBOL(1, fabs),
SYMBOL(2, fdim),
SYMBOL(1, floor),
SYMBOL(3, fma),
SYMBOL(2, fmax),
SYMBOL(2, fmin),
SYMBOL(2, fmod),
SYMBOL(2, hypot),
SYMBOL(1, lgamma),
SYMBOL(1, log),
SYMBOL(1, log10),
SYMBOL(1, log1p),
SYMBOL(1, log2),
SYMBOL(1, logb),
SYMBOL(2, nextafter),
SYMBOL(2, pow),
SYMBOL(2, remainder),
SYMBOL(1, round),
SYMBOL(1, sin),
SYMBOL(1, sinh),
SYMBOL(1, sqrt),
SYMBOL(1, tan),
SYMBOL(1, tanh),
SYMBOL(1, tgamma),
SYMBOL(1, trunc)
};
static double term(const char **);
static double expr(const char **const str, const unsigned level) {
*str += level != 0;
for (double val = term(str);;) {
while (isspace((unsigned char)**str))
++*str;
switch (**str) {
case '*':
if (level > 2)
break;
val *= expr(str, 2);
continue;
case '/':
if (level > 2)
break;
val /= expr(str, 2);
continue;
case '+':
if (level > 1)
break;
val += expr(str, 1);
continue;
case '-':
if (level > 1)
break;
val -= expr(str, 1);
continue;
}
return val;
}
}
static double term(const char **const str) {
while (isspace((unsigned char)**str))
++*str;
{
char *end;
const double val = strtod(*str, &end);
if (*str != end) {
*str = end;
return val;
}
}
if (isalpha((unsigned char)**str)) {
const Symbol *const symbol = bsearch(str, symbols, sizeof(symbols)/sizeof(Symbol), sizeof(Symbol), compar);
if (symbol) {
if (!symbol->arity)
return symbol->func0;
while (isspace((unsigned char)**str))
++*str;
if (*(*str)++ == '(') {
double vals[symbol->arity];
for (unsigned i = 0;;) {
vals[i] = expr(str, 0);
if (++i >= symbol->arity) {
if (*(*str)++ == ')') switch (symbol->arity) {
case 1:
return symbol->func1(vals[0]);
case 2:
return symbol->func2(vals[0], vals[1]);
case 3:
return symbol->func3(vals[0], vals[1], vals[2]);
}
} else if (*(*str)++ == ',')
continue;
break;
}
}
}
} else switch (*(*str)++) {
case '+':
return term(str);
case '-':
return -term(str);
case '(':
{
const double val = expr(str, 0);
if (*(*str)++ == ')')
return val;
}
}
return NAN;
}
#include <stdio.h>
int main(int argc, char **argv) {
for (int arg = 1; arg < argc; ++arg)
if (printf("%g\n", expr(&(const char *){argv[arg]}, 0)) < 0)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}
1 Answer 1
Bug
memcmp(str, name, len);
lets memcmp()
compare the first len
bytes of the data pointed to by str
and name
, yet some of those are pointers to a string less than len
in size. What code needs is strncmp()
(or perhaps even strcmp()
) to not compare past the end of the string.
Bug
The *(const char **)strp += len
relies on bsearch()
to stop calling cmp()
once a return value of 0 occurs. Although this is common, bsearch()
is not specified that way. Instead the calling code should advance the pointer, not the compare function.
Pedantic Bug
str[len]-name[len]
in compar()
should be an unsigned char
compare to well sort symbols that have a character outside the [0...CHAR_MAX]
range. Yet since, presently, symbols[]
only employs ASCII characters, it does not make a difference.
Non-standard macros
M_PI
and its 13 friends are not part of the C standard. See Using M_PI. Consider
#ifndef M_PI
#define M_PI 3.1415926535897932384626433832795
#endif
Lack of comments
Functions deserve at least a little description to indicate its role.
Consider long double
Consider using the most precise/widest range floating point type available.
Use a more precise output
Rather than only 6 digits, use the precision of the type:
// printf("%g\n", some_double)
printf("%.*g\n", DBL_DIG, some_double)
Missing break;
case '(': { ..
deserves a break;
at the end of the case or a comment indicating fall-through was desired.
double expr()
missing return value;
TBD code needed to quiet "Problem description: No return, in function returning non-void".
Minor:
Excessive use of const object
object
would simplify code presentation in many places where there is only a few lines of code.
Example:
// const double val = expr(str, 0);
double val = expr(str, 0);
if (*(*str)++ == ')')
return val;
Format to presentation width
Use an auto-formatter and present to the display width (e.g. ~82)
1 2 3 4 5 6 7 8 9
1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
// const Symbol *const symbol = bsearch(str, symbols, sizeof(symbols)/sizeof(Symbol), sizeof(Symbol), compar);
const Symbol *const symbol = bsearch(str, symbols,
sizeof(symbols)/sizeof(Symbol), sizeof(Symbol), compar);
Note the ()
are not needed with sizeof
object.
Avoid names that differ only in case
// v----v v----v
const Symbol *const symbol
Vertical spacing
Between functions, types, consider a blank line.
-
\$\begingroup\$ Oh, I tried to eliminate the excessively long lines, but looks like I missed one. \$\endgroup\$CPlus– CPlus2023年10月16日 21:40:59 +00:00Commented Oct 16, 2023 at 21:40
-
\$\begingroup\$ @user16217248 Tip: use an auto-formatter. Do not manually format code. \$\endgroup\$chux– chux2023年10月16日 21:42:36 +00:00Commented Oct 16, 2023 at 21:42
-
\$\begingroup\$ I'm not quite sure what you mean by making
str[len]-name[len]
anunsigned char
. You mean like(unsigned char)(str[len]-name[len])
? I don't know what that solves. Also thedouble expr()
misses areturn
at the end because the 'end' is not reachable. The loop runs forever until the return statement at the end of the loop body is reached. \$\endgroup\$CPlus– CPlus2023年10月16日 23:14:32 +00:00Commented Oct 16, 2023 at 23:14 -
\$\begingroup\$ @user16217248 When
char
is signed,str[len]
is of one sign (say +50) andname[len]
is the other sign (say -60),str[len]-name[len]
is positive (110), yet if the strings are accessed viaunsigned char *
,str[len]-name[len]
is like50 - (256-60)
with a negative result. Which is correct for searching a sorted list?strcmp()
andmemcmp()
perform as if in the 2nd case. You code does not experience this issue as the functions all have ASCII names. Yet if π was added to to the calculator, we may have an issue. \$\endgroup\$chux– chux2023年10月17日 01:28:01 +00:00Commented Oct 17, 2023 at 1:28 -
1\$\begingroup\$ Well I could replace the
return
at the end of the loop withbreak
and just put thereturn
at the end of the function. \$\endgroup\$CPlus– CPlus2023年10月17日 02:08:23 +00:00Commented Oct 17, 2023 at 2:08