1
\$\begingroup\$

My code seems to work fine, but I'm interested in how I can make it faster. I really just want to reduce any wasteful computation (for instance eta_m = eta / (double)m reduces coef[j] -= eta * gradient_coef[j] / (double)m to coef[j] -= eta_m * gradient_coef[j]). I realize a good BLAS implementation would probably be useful, but I've found that complicates usage on other systems (unless there's some generalized implementation out there).

I've tried to take advantage of some linear properties (where able) to reduce the number of loops performed, though the initial loop that sets coefficients and gradients to 0.0 could probably be wrapped into the main loops using if statements. The alpha and beta declarations are vestigial from using BLAS but can't imagine they're very impactful. Anything glaringly inefficient that I'm missing here?

#include <stdlib.h>
#include <math.h>
// Compute z = w * x + b
double dlc( unsigned long n, double *X, double *coef, double intercept )
{
 double z = intercept;
 for ( unsigned long j = 0; j < n; j++ )
 {
 z += X[j] * coef[j];
 }
 return z;
}
// Compute y_hat = 1 / (1 + e^(-z))
double dsigmoid( unsigned long n, double alpha, double *X, double *coef, double beta, double intercept )
{
 //double z = intercept;
 //bli_ddotxv( BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE, n, &alpha, X, 1, coef, 1, &beta, &z );
 double z = dlc(n, X, coef, intercept);
 if ( z >= 0)
 {
 return 1.0 / (1.0 + exp(-z));
 }
 else
 {
 return exp(z) / (1.0 + exp(z));
 }
}
// Gradient descent
void dgd( unsigned long m, unsigned long n, double *X, double *y, double *coef, double *intercept, double eta, int max_iter, int fit_intercept )
{
 double alpha = 1.0, beta = 1.0, gradient_intercept = 0.0, eta_m;
 double *y_pred = (double *) malloc (m * sizeof(double));
 double *resid = (double *) malloc (m * sizeof(double));
 double *gradient_coef = (double *) malloc (n * sizeof(double));
 for ( unsigned long j = 0; j < n; j++ )
 {
 coef[j] = 0.0;
 gradient_coef[j] = 0.0;
 }
 *intercept = 0.0;
 eta_m = eta / (double)m;
 
 for ( int epoch = 0; epoch < max_iter; epoch++ )
 {
 // Compute y_hat and gradients
 for ( unsigned long i = 0; i < m; i++ )
 {
 y_pred[i] = dsigmoid( n, alpha, &X[n*i], coef, beta, *intercept );
 resid[i] = y[i] - y_pred[i];
 for ( unsigned long j = 0; j < n; j++ )
 {
 gradient_coef[j] -= (X[n*i + j] * resid[i]);
 }
 if ( fit_intercept == 1 )
 {
 gradient_intercept -= resid[i];
 }
 }
 // Adjust weights
 for ( unsigned long j = 0; j < n; j++ )
 {
 coef[j] -= eta_m * gradient_coef[j];
 gradient_coef[j] = 0.0;
 }
 if ( fit_intercept == 1 )
 {
 *intercept -= eta_m * gradient_intercept;
 gradient_intercept = 0.0;
 }
 }
 free(y_pred);
 free(resid);
 free(gradient_coef);
}

Edit: added code for checking speed of execution.

int main(void)
{
 double *X, *y, *coef, *y_preds;
 double intercept, eta = 0.5;
 double alpha = 1.0, beta = 1.0;
 unsigned long m = 100000;
 unsigned long n = 20;
 int max_iter = 250;
 int class_0 = (unsigned long)(3.0 / 4.0 * (double)m);
 double pct_class_1 = 0.0;
 clock_t test_start;
 clock_t test_end;
 double test_time;
 printf("Constructing variables...\n");
 X = (double *) malloc (m * n * sizeof(double));
 y = (double *) malloc (m * sizeof(double));
 y_preds = (double *) malloc (m * sizeof(double));
 coef = (double *) malloc (n * sizeof(double));
 // Initialize classes
 for ( unsigned long i = 0; i < m; i++ )
 {
 if (i < class_0)
 {
 y[i] = 0.0;
 }
 else
 {
 y[i] = 1.0;
 }
 /*
 // Troubleshooting print
 if (i < 10 || i > m - 10)
 {
 //printf("%f\n", y[i]);
 }
 */
 }
 // Initialize observation features
 for ( unsigned long i = 0; i < m; i++ )
 {
 if (i < class_0)
 {
 X[n*i] = 1.0 / (double)m;
 }
 else
 {
 X[n*i] = (double)i / (double)m;
 }
 X[n*i + 1] = (double)i / (double)m;
 for ( unsigned long j = 2; j < n; j++ )
 {
 X[n*i + j] = (double)rand() / (double)RAND_MAX;
 }
 /*
 // Troubleshooting print
 if (i < 10)
 {
 //printf("%f\t%f\n", X[n*i], X[n*i+1]);
 }
 */
 }
 // Fit weights
 printf("Running GD/SGD...\n");
 test_start = clock();
 dgd( m, n, X, y, coef, &intercept, eta, max_iter, 1 );
 test_end = clock();
 test_time = (double)(test_end - test_start) / CLOCKS_PER_SEC;
 printf("Time taken: %f\n", test_time);
 // Compute y_hat and share of observations predicted as class 1
 printf("Making predictions...\n");
 for ( unsigned long i = 0; i < m; i++ )
 {
 y_preds[i] = dsigmoid( n, alpha, &X[i*n], coef, beta, intercept );
 }
 printf("Printing results...\n");
 for ( unsigned long i = 0; i < m; i++ )
 {
 //printf("%f\n", y_pred[i]);
 if (y_preds[i] > 0.5)
 {
 pct_class_1 += 1.0;
 }
 
 // Troubleshooting print
 if (i < 10 || i > m - 10)
 {
 printf("%g\n", y_preds[i]);
 }
 
 }
 /*
 // Troubleshooting print
 printf("Intercept: %f\n", intercept);
 printf("Coefficients:\n");
 for ( unsigned long i = 0; i < n; i++ )
 {
 printf("%f\n", coef[i]);
 }
 */
 printf("Total observations: %ld\n", m);
 printf("Percent class 1: %f\n", pct_class_1 / (double)m);
 return 0;
}
pacmaninbw
26.2k13 gold badges47 silver badges113 bronze badges
asked Feb 2, 2023 at 15:48
\$\endgroup\$
8
  • 2
    \$\begingroup\$ Do you have some sufficiently-representative test data? I could just stick something arbitrary in there, but if you have it, that would be nice. Also, what sorts of systems are the target? \$\endgroup\$ Commented Feb 2, 2023 at 16:14
  • \$\begingroup\$ BLAS has ports for quite a lot of systems. Do you have any specific cases where you know it cannot be used? \$\endgroup\$ Commented Feb 2, 2023 at 16:28
  • \$\begingroup\$ @harold, sorry, I should have included a test script. In an hour I’ll upload the code which allows you to specify number of observations and features (though all but 2 columns are randomized). \$\endgroup\$ Commented Feb 2, 2023 at 16:56
  • \$\begingroup\$ @Reinderien I’m operating under the assumption that another user might not even know what BLAS is. I use BLIS, which I believe installs custom kernels depending on processor architecture. I assumed a program compiled this way would run into issues on different architectures, or is that wrong? \$\endgroup\$ Commented Feb 2, 2023 at 17:00
  • 2
    \$\begingroup\$ Please do not edit the question, especially the code, after an answer has been posted. Changing the question may cause answer invalidation. Everyone needs to be able to see what the reviewer was referring to. What to do after the question has been answered. \$\endgroup\$ Commented Feb 2, 2023 at 22:18

2 Answers 2

3
\$\begingroup\$

how I can make it faster (?)

At this time, I see no big O() improvements.

2x faster?: Use float

If the range and precision requirements are not so high, many implementations have 2x (or more) fast float functions.

This inner computation loops would benefit most with this change.

e.g.

float z; // float type
return expf(z) / (1.0f + expf(z)); // float constant, function

Of course this is highly dependent on OP's compiler and machine.

Modest: Moved repeated test

Test fit_intercept == 1 once.

 if (fit_intercept == 1) { 
 for (size_t i = 0; i < m; i++ ) {
 y_pred[i] = dsigmoid( n, alpha, &X[n*i], coef, beta, *intercept );
 resid[i] = y[i] - y_pred[i];
 for (size_t j = 0; j < n; j++ ) {
 gradient_coef[j] -= (X[n*i + j] * resid[i]);
 }
 gradient_intercept -= resid[i];
 }
 } else {
 for (size_t i = 0; i < m; i++ ) {
 y_pred[i] = dsigmoid( n, alpha, &X[n*i], coef, beta, *intercept );
 resid[i] = y[i] - y_pred[i];
 for (size_t j = 0; j < n; j++ ) {
 gradient_coef[j] -= (X[n*i + j] * resid[i]);
 }
 }
 }

Minor: Use the right size type for array indexing

size_t is the right size for all array indexing. unsigned long may be too big (or possibly too small).

// double dlc( unsigned long n, double *X, double *coef, double intercept )
double dlc(size_t n, double *X, double *coef, double intercept)

Same for dsigmoid(), dgd().

Minor: Use const for unchanging referenced data

Performance: Lessor compilers will benefit. Good compiler will already know referenced data is not changed.

Functionality: by using const, code may be called with const arrays.

// double dlc( unsigned long n, double *X, double *coef, double intercept )
// double dlc(unsigned long n, const double *X, const double *coef, double intercept)

Same for dsigmoid(), dgd().


Tip

Avoid allocate errors (none were noted here). Allocate to the size of the referenced object and avoid getting the type wrong.

// y_pred = (double *) malloc (m * sizeof(double));
// Correct even if `y_red` later changed to a `float *`.
y_pred = malloc (sizeof y_pred[0] * m); 
answered Feb 2, 2023 at 21:52
\$\endgroup\$
2
  • \$\begingroup\$ Thanks for this, I think I've made those adjustments. I also made some additional changes to my code on my own so I'm about to update to reflect. Your suggestions were still relevant! \$\endgroup\$ Commented Feb 2, 2023 at 22:03
  • 1
    \$\begingroup\$ @hillard28 I added the float idea. Hope it works well for you. \$\endgroup\$ Commented Feb 3, 2023 at 19:31
3
\$\begingroup\$

This code doesn't need the casts (as malloc() returns a pointer to void, which is directly convertible to any object pointer):

 double *y_pred = (double *) malloc (m * sizeof(double));
 double *resid = (double *) malloc (m * sizeof(double));
 double *gradient_coef = (double *) malloc (n * sizeof(double));

We also run into Undefined behaviour when allocation fails and malloc() therefore returns a null pointer, since we dereference it without checking.

I'd replace with

 double *const y_pred = malloc(m * sizeof *y_pred);
 double *const resid = malloc(m * sizeof *resid);
 double *const gradient_coef = malloc(n * sizeof *gradient_coef);
 if (!y_pred || !resid || !gradient_coef) {
 free(y_pred);
 free(resid);
 free(gradient_coef);
 return -1; /* or whatever error indication you prefer /*
 }
answered Feb 2, 2023 at 16:18
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.