My code seems to work fine, but I'm interested in how I can make it faster. I really just want to reduce any wasteful computation (for instance eta_m = eta / (double)m
reduces coef[j] -= eta * gradient_coef[j] / (double)m
to coef[j] -= eta_m * gradient_coef[j]
). I realize a good BLAS implementation would probably be useful, but I've found that complicates usage on other systems (unless there's some generalized implementation out there).
I've tried to take advantage of some linear properties (where able) to reduce the number of loops performed, though the initial loop that sets coefficients and gradients to 0.0 could probably be wrapped into the main loops using if statements. The alpha and beta declarations are vestigial from using BLAS but can't imagine they're very impactful. Anything glaringly inefficient that I'm missing here?
#include <stdlib.h>
#include <math.h>
// Compute z = w * x + b
double dlc( unsigned long n, double *X, double *coef, double intercept )
{
double z = intercept;
for ( unsigned long j = 0; j < n; j++ )
{
z += X[j] * coef[j];
}
return z;
}
// Compute y_hat = 1 / (1 + e^(-z))
double dsigmoid( unsigned long n, double alpha, double *X, double *coef, double beta, double intercept )
{
//double z = intercept;
//bli_ddotxv( BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE, n, &alpha, X, 1, coef, 1, &beta, &z );
double z = dlc(n, X, coef, intercept);
if ( z >= 0)
{
return 1.0 / (1.0 + exp(-z));
}
else
{
return exp(z) / (1.0 + exp(z));
}
}
// Gradient descent
void dgd( unsigned long m, unsigned long n, double *X, double *y, double *coef, double *intercept, double eta, int max_iter, int fit_intercept )
{
double alpha = 1.0, beta = 1.0, gradient_intercept = 0.0, eta_m;
double *y_pred = (double *) malloc (m * sizeof(double));
double *resid = (double *) malloc (m * sizeof(double));
double *gradient_coef = (double *) malloc (n * sizeof(double));
for ( unsigned long j = 0; j < n; j++ )
{
coef[j] = 0.0;
gradient_coef[j] = 0.0;
}
*intercept = 0.0;
eta_m = eta / (double)m;
for ( int epoch = 0; epoch < max_iter; epoch++ )
{
// Compute y_hat and gradients
for ( unsigned long i = 0; i < m; i++ )
{
y_pred[i] = dsigmoid( n, alpha, &X[n*i], coef, beta, *intercept );
resid[i] = y[i] - y_pred[i];
for ( unsigned long j = 0; j < n; j++ )
{
gradient_coef[j] -= (X[n*i + j] * resid[i]);
}
if ( fit_intercept == 1 )
{
gradient_intercept -= resid[i];
}
}
// Adjust weights
for ( unsigned long j = 0; j < n; j++ )
{
coef[j] -= eta_m * gradient_coef[j];
gradient_coef[j] = 0.0;
}
if ( fit_intercept == 1 )
{
*intercept -= eta_m * gradient_intercept;
gradient_intercept = 0.0;
}
}
free(y_pred);
free(resid);
free(gradient_coef);
}
Edit: added code for checking speed of execution.
int main(void)
{
double *X, *y, *coef, *y_preds;
double intercept, eta = 0.5;
double alpha = 1.0, beta = 1.0;
unsigned long m = 100000;
unsigned long n = 20;
int max_iter = 250;
int class_0 = (unsigned long)(3.0 / 4.0 * (double)m);
double pct_class_1 = 0.0;
clock_t test_start;
clock_t test_end;
double test_time;
printf("Constructing variables...\n");
X = (double *) malloc (m * n * sizeof(double));
y = (double *) malloc (m * sizeof(double));
y_preds = (double *) malloc (m * sizeof(double));
coef = (double *) malloc (n * sizeof(double));
// Initialize classes
for ( unsigned long i = 0; i < m; i++ )
{
if (i < class_0)
{
y[i] = 0.0;
}
else
{
y[i] = 1.0;
}
/*
// Troubleshooting print
if (i < 10 || i > m - 10)
{
//printf("%f\n", y[i]);
}
*/
}
// Initialize observation features
for ( unsigned long i = 0; i < m; i++ )
{
if (i < class_0)
{
X[n*i] = 1.0 / (double)m;
}
else
{
X[n*i] = (double)i / (double)m;
}
X[n*i + 1] = (double)i / (double)m;
for ( unsigned long j = 2; j < n; j++ )
{
X[n*i + j] = (double)rand() / (double)RAND_MAX;
}
/*
// Troubleshooting print
if (i < 10)
{
//printf("%f\t%f\n", X[n*i], X[n*i+1]);
}
*/
}
// Fit weights
printf("Running GD/SGD...\n");
test_start = clock();
dgd( m, n, X, y, coef, &intercept, eta, max_iter, 1 );
test_end = clock();
test_time = (double)(test_end - test_start) / CLOCKS_PER_SEC;
printf("Time taken: %f\n", test_time);
// Compute y_hat and share of observations predicted as class 1
printf("Making predictions...\n");
for ( unsigned long i = 0; i < m; i++ )
{
y_preds[i] = dsigmoid( n, alpha, &X[i*n], coef, beta, intercept );
}
printf("Printing results...\n");
for ( unsigned long i = 0; i < m; i++ )
{
//printf("%f\n", y_pred[i]);
if (y_preds[i] > 0.5)
{
pct_class_1 += 1.0;
}
// Troubleshooting print
if (i < 10 || i > m - 10)
{
printf("%g\n", y_preds[i]);
}
}
/*
// Troubleshooting print
printf("Intercept: %f\n", intercept);
printf("Coefficients:\n");
for ( unsigned long i = 0; i < n; i++ )
{
printf("%f\n", coef[i]);
}
*/
printf("Total observations: %ld\n", m);
printf("Percent class 1: %f\n", pct_class_1 / (double)m);
return 0;
}
-
2\$\begingroup\$ Do you have some sufficiently-representative test data? I could just stick something arbitrary in there, but if you have it, that would be nice. Also, what sorts of systems are the target? \$\endgroup\$user555045– user5550452023年02月02日 16:14:00 +00:00Commented Feb 2, 2023 at 16:14
-
\$\begingroup\$ BLAS has ports for quite a lot of systems. Do you have any specific cases where you know it cannot be used? \$\endgroup\$Reinderien– Reinderien2023年02月02日 16:28:25 +00:00Commented Feb 2, 2023 at 16:28
-
\$\begingroup\$ @harold, sorry, I should have included a test script. In an hour I’ll upload the code which allows you to specify number of observations and features (though all but 2 columns are randomized). \$\endgroup\$hillard28– hillard282023年02月02日 16:56:49 +00:00Commented Feb 2, 2023 at 16:56
-
\$\begingroup\$ @Reinderien I’m operating under the assumption that another user might not even know what BLAS is. I use BLIS, which I believe installs custom kernels depending on processor architecture. I assumed a program compiled this way would run into issues on different architectures, or is that wrong? \$\endgroup\$hillard28– hillard282023年02月02日 17:00:17 +00:00Commented Feb 2, 2023 at 17:00
-
2\$\begingroup\$ Please do not edit the question, especially the code, after an answer has been posted. Changing the question may cause answer invalidation. Everyone needs to be able to see what the reviewer was referring to. What to do after the question has been answered. \$\endgroup\$pacmaninbw– pacmaninbw ♦2023年02月02日 22:18:06 +00:00Commented Feb 2, 2023 at 22:18
2 Answers 2
how I can make it faster (?)
At this time, I see no big O() improvements.
2x faster?: Use float
If the range and precision requirements are not so high, many implementations have 2x (or more) fast float
functions.
This inner computation loops would benefit most with this change.
e.g.
float z; // float type
return expf(z) / (1.0f + expf(z)); // float constant, function
Of course this is highly dependent on OP's compiler and machine.
Modest: Moved repeated test
Test fit_intercept == 1
once.
if (fit_intercept == 1) {
for (size_t i = 0; i < m; i++ ) {
y_pred[i] = dsigmoid( n, alpha, &X[n*i], coef, beta, *intercept );
resid[i] = y[i] - y_pred[i];
for (size_t j = 0; j < n; j++ ) {
gradient_coef[j] -= (X[n*i + j] * resid[i]);
}
gradient_intercept -= resid[i];
}
} else {
for (size_t i = 0; i < m; i++ ) {
y_pred[i] = dsigmoid( n, alpha, &X[n*i], coef, beta, *intercept );
resid[i] = y[i] - y_pred[i];
for (size_t j = 0; j < n; j++ ) {
gradient_coef[j] -= (X[n*i + j] * resid[i]);
}
}
}
Minor: Use the right size type for array indexing
size_t
is the right size for all array indexing. unsigned long
may be too big (or possibly too small).
// double dlc( unsigned long n, double *X, double *coef, double intercept )
double dlc(size_t n, double *X, double *coef, double intercept)
Same for dsigmoid()
, dgd()
.
Minor: Use const
for unchanging referenced data
Performance: Lessor compilers will benefit. Good compiler will already know referenced data is not changed.
Functionality: by using const
, code may be called with const
arrays.
// double dlc( unsigned long n, double *X, double *coef, double intercept )
// double dlc(unsigned long n, const double *X, const double *coef, double intercept)
Same for dsigmoid()
, dgd()
.
Tip
Avoid allocate errors (none were noted here). Allocate to the size of the referenced object and avoid getting the type wrong.
// y_pred = (double *) malloc (m * sizeof(double));
// Correct even if `y_red` later changed to a `float *`.
y_pred = malloc (sizeof y_pred[0] * m);
-
\$\begingroup\$ Thanks for this, I think I've made those adjustments. I also made some additional changes to my code on my own so I'm about to update to reflect. Your suggestions were still relevant! \$\endgroup\$hillard28– hillard282023年02月02日 22:03:45 +00:00Commented Feb 2, 2023 at 22:03
-
1\$\begingroup\$ @hillard28 I added the
float
idea. Hope it works well for you. \$\endgroup\$chux– chux2023年02月03日 19:31:36 +00:00Commented Feb 3, 2023 at 19:31
This code doesn't need the casts (as malloc()
returns a pointer to void, which is directly convertible to any object pointer):
double *y_pred = (double *) malloc (m * sizeof(double)); double *resid = (double *) malloc (m * sizeof(double)); double *gradient_coef = (double *) malloc (n * sizeof(double));
We also run into Undefined behaviour when allocation fails and malloc()
therefore returns a null pointer, since we dereference it without checking.
I'd replace with
double *const y_pred = malloc(m * sizeof *y_pred);
double *const resid = malloc(m * sizeof *resid);
double *const gradient_coef = malloc(n * sizeof *gradient_coef);
if (!y_pred || !resid || !gradient_coef) {
free(y_pred);
free(resid);
free(gradient_coef);
return -1; /* or whatever error indication you prefer /*
}