Calculating Polynomial: Computing very large numbers emulating double using 2 floats

Question 1

I am working on a polynomial algorithm which requires calculations in large numbers, up to e+38. I am using a 32-bit system with compiler support of 32 bits for long/float/double. So far, by searching online, I've learned that single-precision floating points (FPs) can be used for double-precision FPs.

From this question asked by someone earlier (Emulate double using 2 floats), I found this paper which has the algorithm to work with double-precision FPs in GPUs. It is too confusing for me to implement in C. I just need four basic mathematical operations. Is there any way I could find an example for this which will help me understand it better?

Here is the code I am working on. It might have shortcomings I cannot see, but that is pretty much what I am trying to implement. In the algorithm, POLYNOMIAL_ORDER should be able to go up to fourth order (can settle at third order if the standard deviation is smaller). A few things I am not sure about are:

Whether procedures make_float() and make_float() are correct or not
Use of make_float() in the program

#define POLYNOMIAL_ORDER (3)
#define TC_TABLE_SIZE (14)
typedef struct vector_float2{
float x;
float y;
}float2;
typedef struct
{
 float tc0;
 float tc1;
 float tc2;
 float tc3;
}POLYNOMIALS;
typedef struct {
 int16_t Temp;
 int16_t Comp; 
} TempCompPair;
volatile TempCompPair TCtable[TC_TABLE_SIZE] = {{22452,1651},
 {25318,1444},
 {28268,1133},
 {31120,822},
 {34027,511},
 {36932,185},
 {39770,-81},
 {42685,-288},
 {45531,-407},
 {48425,-632},
 {51401,-703},
 {54460,-1143},
 {57202,-1420},
 {60027,-1652}};
POLYNOMIALS polynomials;
float matrix[TC_TABLE_SIZE][TC_TABLE_SIZE] = {0};
float average[TC_TABLE_SIZE] = {0};
float make_float(float x, float y)
{
 return x+y;
}
float2 make_float2(float a, float b)
{
 float2 f2 = {a,b};
 return f2;
}
float2 quickTwoSum(float a, float b)
{
 float s = a+b;
 float e = b - (s - a);
 float2 result = {s, e};
 return result;
}
float2 twoSum(float a, float b)
{
 volatile float s = a + b;
 float v = s - a;
 float e = (a - (s - v)) + (b - v);
 float2 result = {s , e};
 return result;
}
float2 df64_add(float2 a, float2 b)
{
 float2 s,t;
 s = twoSum(a.x, b.x);
 t = twoSum(a.y, b.y);
 s.y += t.x;
 s = quickTwoSum(s.x, s.y);
 s.y += t.y;
 s = quickTwoSum(s.x, s.y);
 return s;
}
float2 split(float a)
{
 const float split = 4097; //(1<<12) + 1
 float t = a *split;
 float a_hi = t - (t - a);
 float a_lo = a - a_hi;
 float2 result = {a_hi, a_lo};
 return result;
}
float2 twoProd(float a, float b)
{
 float p = a*b;
 float2 aS = split(a);
 float2 bS = split(b);
 float err = ((aS.x * bS.x - p)
 + aS.x * bS.y + aS.y * bS.x)
 + aS.y * bS.y;
 float2 result = {p, err};
 return result;
}
float2 df64_mult(float2 a, float2 b)
{
 float2 p;
 p = twoProd(a.x,b.x);
 p.y += a.x * b.y;
 p.y += a.y * b.x;
 p = quickTwoSum(p.x,p.y);
 return p;
}
float2 calculate_power(float base, int pow)
{
 int i = 0;
 float2 base_f2 = make_float2(base,0);
 float2 result_f2 = {1,0};
 if(pow == 0)
 {
 return result_f2;
 }
 if(pow > 0)
 {
 if(pow == 1)
 {
 return base_f2;
 }
 else
 {
 for(i = 0; i < pow; i++)
 {
 result_f2 = df64_mult(result_f2,base_f2);
 }
 return result_f2;
 }
 }
 else
 {
 return result_f2;
 //Mechanism for negative powers
 }
}
void TComp_Polynomial()
{
 int i;
 int j;
 int k;
 int size;
 float temp;
 float2 sum = {0,0};
 float2 result0 = {0,0};
 float2 result1 = {0,0};
 float x[TC_TABLE_SIZE];
 float y[TC_TABLE_SIZE];
 for(i = 0; i < TC_TABLE_SIZE; i++)
 {
 x[i] = (float) TCtable[i].Temp;
 y[i] = (float) TCtable[i].Comp;
 }
 size = i;
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 for(j = 0; j <= POLYNOMIAL_ORDER; j++)
 {
 sum.x = 0;
 sum.y = 0;
 for(k = 0; k < size; k++)
 {
 // Expression simplified below: **sum += pow(x[k],i+j)** 
 result0 = calculate_power(x[k], i+j);
 sum = df64_add(result0,sum);
 }
 matrix[i][j] = make_float(sum.x,sum.y);
 }
 }
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 sum.x = 0;
 sum.y = 0;
 for(j = 0; j < size; j++)
 {
 // Expression simplified below: **sum += y[j] * pow(x[j],i)**
 result0 = calculate_power(x[j], i);
 result1 = df64_mult( result0 , make_float2(y[j],0) );
 sum = df64_add(result1,sum);
 }
 average[i] = make_float(sum.x,sum.y);
 }
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 for(j = 0; j <= POLYNOMIAL_ORDER; j++)
 {
 if(j != i)
 {
 if(matrix[i][i]!= 0)
 {
 temp = matrix[j][i]/matrix[i][i];
 }
 for(k = i; k < POLYNOMIAL_ORDER; k++)
 {
 matrix[j][k] -= temp*matrix[i][k];
 }
 average[j] -= temp*average[i];
 }
 }
 }
 if(matrix[0][0] != 0)
 {
 polynomials.tc0 = average[0]/matrix[0][0];
 }
 if(matrix[1][1] != 0)
 {
 polynomials.tc1 = average[1]/matrix[1][1];
 }
 if(matrix[2][2] != 0)
 {
 polynomials.tc2 = average[2]/matrix[2][2];
 }
 if(matrix[3][3] != 0)
 {
 polynomials.tc3 = average[3]/matrix[3][3];
 }
}

and then use the struct polynomials.tc0/1/2/3 in this expression:

// y = x^3 * d + x^2 * c + x^1 * b + a ;
double calculate_equation(uint16_t TEMP)
{
 double Y;
 if(POLYNOMIAL_ORDER == 1)
 {
 Y = polynomials.tc1*(double)TEMP + polynomials.tc0; 
 }
 else if(POLYNOMIAL_ORDER == 2)
 {
 Y = (polynomials.tc2 * (double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0; 
 }
 else if(POLYNOMIAL_ORDER == 3)
 {
 Y = ((polynomials.tc3 * (double)TEMP + polynomials.tc2)*(double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0; 
 }
 else if(POLYNOMIAL_ORDER == 4)
 {
 Y = (((polynomials.tc4 * (double)TEMP + polynomials.tc3)*(double)TEMP + polynomials.tc2)*(double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0; 
 }
 return Y;
}

And standard deviation is calculated as follows:

//sqrt(sigma(error^2))
 for(i = 0; i < TC_TABLE_SIZE; i++)
 {
 actual_comp[i] =(int) calculate_equation(TCtable[i].Temp);
 error[i] = TCtable[i].Comp - actual_comp[i] ;
 error_sqr += error[i]*error[i];
 printf("%u\t%d\t\t%e\n", TCtable[i].Temp, TCtable[i].Comp, actual_comp[i] );
 }
 error_sqrt = sqrt(error_sqr);

Reference

Question 2

you might want to look at the header file: bignum.h

Question 3

@user3629249: I have limitations on the code size and also on the use of open-source libraries. Thanks for the suggestions though.

Question 4

Yes, Unfortunately I am stuck with an old compiler from Texas Instrument for MSP430f2618 chip with compiler TI v4.0.0.B1. This is the best bet for me. I can change the compiler to updated one with an expanse of changing BSL and lot of firmware, requires a lot of time.

Question 5

(Do you know about the signal processing exchange?)

Question 6

Your best bet may be doing floating point "manually", using one word for exponent and three (storage?) or four (computation/accumulation) for mantissa. (Rolling your own, you can use silly/smart tricks like normalizing to word/byte boundaries or, with polynomials, choose to use fewer words of the operands where the product is known to be small.

Question 7

I was able to implement this code without using double precision as the calculations were in the range of Float. Here's my implementation, let me know if I can optimize it better.

typedef struct
{ int64_t tc0;
 int64_t tc1;
 int64_t tc2;
 int64_t tc3;
 int64_t tc4;
}POLYNOMIALS;
POLYNOMIALS polynomials = {0,0,0,0,0};
int16_t TempCompIndex;
int64_t x[TC_TABLE_SIZE];
int64_t y[TC_TABLE_SIZE];
float matrix[POLYNOMIAL_ORDER+1][POLYNOMIAL_ORDER+1] = {0};
float average[POLYNOMIAL_ORDER+1] = {0};
void TComp_Polynomial()
{
 int i;
 int j;
 int k;
 int size;
 float temp;
 float sum = 0;
 float powr = 0;
 float prod;
 int64_t x[TC_TABLE_SIZE];
 int64_t y[TC_TABLE_SIZE];
 for(i = 0; i < TC_TABLE_SIZE; i++)
 {
 x[i] = (int64_t) TCtable[i].Temp;
 y[i] = (int64_t) TCtable[i].Comp<<PRECISION;
 printf("x: %lld, y:%lld\n",x[i],y[i]);
 }
 size = i;
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 for(j = 0; j <= POLYNOMIAL_ORDER; j++)
 {
 sum = 0;
 powr = 0;
 for(k = 0; k < size; k++)
 { 
 //printf("x[%d]: %ld, i: %d ,j: %d ", k, x[k],i,j);
 powr = pow(x[k],i+j);
 //printf("Power: %f, sum: %f\n ",powr,sum);
 sum += powr;
 //printf("%f\r\n",powr);
 //printf("sum: %lf\n",sum );
 }
 matrix[i][j] = sum;
 printf("sum: %g\n",sum);
 }
 }
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 sum = 0;
 powr = 0;
 for(j = 0; j < size; j++)
 {
 //sum += y[j] * pow(x[j],i)
 //printf("sum: %lf, y[%d]: %lf, x[%d]: %lf^%d ",sum,j,y[j], i, x[j],i);
 //printf("x[%d]:%lld ^ %d\t",j,x[j],i);
 powr = (float) pow(x[j],i);
 printf("powr: %f\t",powr);
 prod = (float) y[j] * powr;
 printf("prod:%f \t %lld \t", prod,y[j]);
 sum += (float) prod;
 printf("sum: %f \n",sum);
 }
 average[i] = sum;
 //printf("#Avg: %f\n",average[i]);
 }
 printf("\n\n");
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 for(j = 0; j <= POLYNOMIAL_ORDER; j++)
 {
 if(j != i)
 { 
 if(matrix[i][i]!= 0)
 {
 //printf("matrix%d%d: %g / matrix%d%d: %g =\t ",j,i,matrix[j][i],i,i,matrix[i][i]);
 temp = matrix[j][i]/matrix[i][i];
 //printf("Temp: %g\n",temp);
 } 
 for(k = i; k < POLYNOMIAL_ORDER; k++)
 { 
 matrix[j][k] -= temp*matrix[i][k];
 //printf("matrix[%d][%d]:%g, %g, matrix[%d][%d]:%g\n",j,k,matrix[j][k], temp,i,k,matrix[i][k]);
 }
 //printf("\n\n");
 //print_matrix();
 printf("\n\n");
 //printf("avg%d: %g\ttemp: %g\tavg%d: %g\n\n",j,average[j],temp,i,average[i]);
 average[j] -= temp*average[i];
 printf("#Avg%d:%g\n",j,average[j]);
 //print_average();
 }
 }
 }
 print_matrix();
 print_average();
/* Calculate polynomial Coefficients (n+1) based on the POLYNOMIAL_ORDER (n) */
#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
 if(matrix[0][0] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 }
#elif POLYNOMIAL_ORDER == 1
 if(matrix[1][1] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
 }
#elif POLYNOMIAL_ORDER == 2
 if(matrix[2][2] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
 polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
 }
#elif POLYNOMIAL_ORDER == 3
 if(matrix[3][3] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
 polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
 polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
 }
#elif POLYNOMIAL_ORDER == 4
 if(matrix[4][4] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
 polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
 polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
 polynomials.tc4 = (int64_t) (average[4]/matrix[4][4]);
 }
#endif
 }
 int16_t calculate_equation(uint16_t TEMP)
{
 int64_t Y = 0;
 int16_t TempComp = 0;
#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
 Y = polynomials.tc0;
#elif POLYNOMIAL_ORDER == 1
 Y = polynomials.tc1* ((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 2
 Y = (polynomials.tc2 * ((int64_t)TEMP) + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 3
 Y = ((polynomials.tc3 * ((int64_t)TEMP) + polynomials.tc2)*((int64_t)TEMP) + polynomials.tc1)*((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 4
 Y = (((polynomials.tc4 * (int64_t)TEMP + polynomials.tc3)*(int64_t)TEMP + polynomials.tc2)*(int64_t)TEMP + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#endif
 TempComp = (int16_t) (Y>>PRECISION_BITS);
 return TempComp;
}
void main(){
int16_t TempComp = 0;
TempCompValue = (int16_t) calculate_equation(Mon_Temp);
}

Note: Calculate_Equation() is being called once a second and it is required to not use float in order to avoid floating point arithmetic, hence I am using non-float variables in that function.

It is working right for me and haven't discovered any bug after initial testing. Thanks every one for taking interest in my post, if not the answer, got to learn some new techniques. And thanks @chux.

Question 8

(I think it better to post a (cross-linked) follow-up question.) Don't use #if ID == 0 s0 #elif ID == 1 s0 s1 #elif ID == 2 s0 s1 s2 ... - use #if ID <= 0 s0 #if ID <= 1 s1 #if ID <= 2 s2 ...# annoying_amount_of_ endif.

Question 9

@greybeard: Thanks for the suggestion. Updated in my code.

Rishit Rishit 615 bronze badges · Answer 1 · 2017-08-07 20:19:52Z

I was able to implement this code without using double precision as the calculations were in the range of Float. Here's my implementation, let me know if I can optimize it better.

typedef struct
{ int64_t tc0;
 int64_t tc1;
 int64_t tc2;
 int64_t tc3;
 int64_t tc4;
}POLYNOMIALS;
POLYNOMIALS polynomials = {0,0,0,0,0};
int16_t TempCompIndex;
int64_t x[TC_TABLE_SIZE];
int64_t y[TC_TABLE_SIZE];
float matrix[POLYNOMIAL_ORDER+1][POLYNOMIAL_ORDER+1] = {0};
float average[POLYNOMIAL_ORDER+1] = {0};
void TComp_Polynomial()
{
 int i;
 int j;
 int k;
 int size;
 float temp;
 float sum = 0;
 float powr = 0;
 float prod;
 int64_t x[TC_TABLE_SIZE];
 int64_t y[TC_TABLE_SIZE];
 for(i = 0; i < TC_TABLE_SIZE; i++)
 {
 x[i] = (int64_t) TCtable[i].Temp;
 y[i] = (int64_t) TCtable[i].Comp<<PRECISION;
 printf("x: %lld, y:%lld\n",x[i],y[i]);
 }
 size = i;
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 for(j = 0; j <= POLYNOMIAL_ORDER; j++)
 {
 sum = 0;
 powr = 0;
 for(k = 0; k < size; k++)
 { 
 //printf("x[%d]: %ld, i: %d ,j: %d ", k, x[k],i,j);
 powr = pow(x[k],i+j);
 //printf("Power: %f, sum: %f\n ",powr,sum);
 sum += powr;
 //printf("%f\r\n",powr);
 //printf("sum: %lf\n",sum );
 }
 matrix[i][j] = sum;
 printf("sum: %g\n",sum);
 }
 }
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 sum = 0;
 powr = 0;
 for(j = 0; j < size; j++)
 {
 //sum += y[j] * pow(x[j],i)
 //printf("sum: %lf, y[%d]: %lf, x[%d]: %lf^%d ",sum,j,y[j], i, x[j],i);
 //printf("x[%d]:%lld ^ %d\t",j,x[j],i);
 powr = (float) pow(x[j],i);
 printf("powr: %f\t",powr);
 prod = (float) y[j] * powr;
 printf("prod:%f \t %lld \t", prod,y[j]);
 sum += (float) prod;
 printf("sum: %f \n",sum);
 }
 average[i] = sum;
 //printf("#Avg: %f\n",average[i]);
 }
 printf("\n\n");
 for(i = 0; i <= POLYNOMIAL_ORDER; i++)
 {
 for(j = 0; j <= POLYNOMIAL_ORDER; j++)
 {
 if(j != i)
 { 
 if(matrix[i][i]!= 0)
 {
 //printf("matrix%d%d: %g / matrix%d%d: %g =\t ",j,i,matrix[j][i],i,i,matrix[i][i]);
 temp = matrix[j][i]/matrix[i][i];
 //printf("Temp: %g\n",temp);
 } 
 for(k = i; k < POLYNOMIAL_ORDER; k++)
 { 
 matrix[j][k] -= temp*matrix[i][k];
 //printf("matrix[%d][%d]:%g, %g, matrix[%d][%d]:%g\n",j,k,matrix[j][k], temp,i,k,matrix[i][k]);
 }
 //printf("\n\n");
 //print_matrix();
 printf("\n\n");
 //printf("avg%d: %g\ttemp: %g\tavg%d: %g\n\n",j,average[j],temp,i,average[i]);
 average[j] -= temp*average[i];
 printf("#Avg%d:%g\n",j,average[j]);
 //print_average();
 }
 }
 }
 print_matrix();
 print_average();
/* Calculate polynomial Coefficients (n+1) based on the POLYNOMIAL_ORDER (n) */
#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
 if(matrix[0][0] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 }
#elif POLYNOMIAL_ORDER == 1
 if(matrix[1][1] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
 }
#elif POLYNOMIAL_ORDER == 2
 if(matrix[2][2] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
 polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
 }
#elif POLYNOMIAL_ORDER == 3
 if(matrix[3][3] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
 polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
 polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
 }
#elif POLYNOMIAL_ORDER == 4
 if(matrix[4][4] != 0)
 {
 polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
 polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
 polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
 polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
 polynomials.tc4 = (int64_t) (average[4]/matrix[4][4]);
 }
#endif
 }
 int16_t calculate_equation(uint16_t TEMP)
{
 int64_t Y = 0;
 int16_t TempComp = 0;
#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
 Y = polynomials.tc0;
#elif POLYNOMIAL_ORDER == 1
 Y = polynomials.tc1* ((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 2
 Y = (polynomials.tc2 * ((int64_t)TEMP) + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 3
 Y = ((polynomials.tc3 * ((int64_t)TEMP) + polynomials.tc2)*((int64_t)TEMP) + polynomials.tc1)*((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 4
 Y = (((polynomials.tc4 * (int64_t)TEMP + polynomials.tc3)*(int64_t)TEMP + polynomials.tc2)*(int64_t)TEMP + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#endif
 TempComp = (int16_t) (Y>>PRECISION_BITS);
 return TempComp;
}
void main(){
int16_t TempComp = 0;
TempCompValue = (int16_t) calculate_equation(Mon_Temp);
}

Note: Calculate_Equation() is being called once a second and it is required to not use float in order to avoid floating point arithmetic, hence I am using non-float variables in that function.

It is working right for me and haven't discovered any bug after initial testing. Thanks every one for taking interest in my post, if not the answer, got to learn some new techniques. And thanks @chux.

(I think it better to post a (cross-linked) follow-up question.) Don't use #if ID == 0 s0 #elif ID == 1 s0 s1 #elif ID == 2 s0 s1 s2 ... - use #if ID <= 0 s0 #if ID <= 1 s1 #if ID <= 2 s2 ...# annoying_amount_of_ endif.

Stack Exchange Network

Calculating Polynomial: Computing very large numbers emulating double using 2 floats

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Calculating Polynomial: Computing very large numbers emulating double using 2 floats

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions