10
\$\begingroup\$

I understand that C++ STL template sort runs faster than qsort in C because qsort needs to call the comparator multiple times and C++ makes the code specific to that type. However, C's pre-processor can hack that, too. So I build a test using qsort.

#include <stdlib.h> /* EXIT */
#include <stdio.h> /* printf fputc stdout sprintf */
#include <string.h> /* strcmp */
#include <errno.h> /* errno */
#include <time.h> /* clock */
#include <ctype.h> /* toupper */
#include <math.h> /* pow */
/* Linked with a qsort called qsort_ that acts as a control to test qsort.
 With C89, no inline, one has to optimise to get anything. */
/*#define QSORT_*/
static int int_cmp(const int *const a, const int *const b) {
 return (*a > *b) - (*b > *a);
}
#define SORT_NAME Int
#define SORT_TYPE int
#define SORT_COMPARATOR &int_cmp
#include "Sort.h"
static int int_void_cmp(const void *a, const void *b) {
 return int_cmp(a, b);
}
static void int_fill(int *const a) {
 *a = rand();
}
struct Foo {
 int no;
 char name[64];
};
static const size_t foo_name_size = sizeof ((struct Foo *)0)->name;
static int foo_no_cmp(const struct Foo *const a, const struct Foo *const b) {
 return (a->no > b->no) - (b->no > a->no);
}
#define SORT_NAME FooNo
#define SORT_TYPE struct Foo
#define SORT_COMPARATOR &foo_no_cmp
#include "Sort.h"
static int foo_name_cmp(const struct Foo *const a, const struct Foo *const b) {
 return strcmp(a->name, b->name);
}
#define SORT_NAME FooName
#define SORT_TYPE struct Foo
#define SORT_COMPARATOR &foo_name_cmp
#include "Sort.h"
static int foo_void_no_cmp(const void *a, const void *b) {
 return foo_no_cmp(a, b);
}
static int foo_void_name_cmp(const void *a, const void *b) {
 return foo_name_cmp(a, b);
}
static void foo_fill(struct Foo *const a) {
 const size_t num_chars = 4 + rand() / (RAND_MAX / (foo_name_size - 5) + 1);
 size_t i;
 int_fill(&a->no);
 for(i = 0; i < num_chars; i++) {
 a->name[i] = (i ? 'a' : 'A') + rand() / (RAND_MAX + 1.0) * 26;
 }
 a->name[i] = '0円';
}
/* The control -- not going to include this. C89 */
#ifdef QSORT_ /* <-- my */
void
qsort_(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
#endif /* my --> */
typedef int (*TestFn)(const size_t, double *const);
/* This is messy. */
#define CAT_(x, y) x ## y
#define CAT(x, y) CAT_(x, y)
#define TEST_FN(NAME, TYPE, FILL, TEST) \
static int CAT(NAME, _test)(const size_t size, double *const p_ms_time) {\
 TYPE *a = malloc(sizeof *a * size); \
 size_t i; \
 clock_t run; \
 if(!a) return 0; \
 for(i = 0; i < size; i++) (FILL)(a + i); \
 run = -clock(); \
 (TEST); \
 run += clock(); \
 *p_ms_time = (double)run / (CLOCKS_PER_SEC / 1000.0); \
 free(a); \
 return 1; \
}
TEST_FN(int_q, int, int_fill, qsort(a, size, sizeof *a, &int_void_cmp))
TEST_FN(int_s, int, int_fill, IntSort(a, size))
TEST_FN(foo_no_q, struct Foo, foo_fill, \
 qsort(a, size, sizeof *a, &foo_void_no_cmp))
TEST_FN(foo_no_s, struct Foo, foo_fill, FooNoSort(a, size))
TEST_FN(foo_name_q, struct Foo, foo_fill, \
 qsort(a, size, sizeof *a, &foo_void_name_cmp))
TEST_FN(foo_name_s, struct Foo, foo_fill, FooNameSort(a, size))
#ifdef QSORT_ /* <-- my */
TEST_FN(int_m, int, int_fill, qsort_(a, size, sizeof *a, &int_void_cmp))
TEST_FN(foo_no_m, struct Foo, foo_fill, \
 qsort_(a, size, sizeof *a, &foo_void_no_cmp))
TEST_FN(foo_name_m, struct Foo, foo_fill, \
 qsort_(a, size, sizeof *a, &foo_void_name_cmp))
#endif /* my --> */
static const char *const gnu_name = "sort.gnu";
static const char *const graph_name = "sort.eps";
static const unsigned replicas = 5;
static const size_t max_elements = 100000;
int main(void) {
 struct TimeData { const char *const name; TestFn fn; } time_data[] = {
 { "qsort int", &int_q_test },
 { "IntSort", &int_s_test },
 { "qsort foo::no", &foo_no_q_test },
 { "FooNoSort", &foo_no_s_test },
 { "qsort foo::name", &foo_name_q_test },
 { "FooNameSort", &foo_name_s_test }
#ifdef QSORT_ /* <-- my */
 ,
 { "my qsort_ int", &int_m_test },
 { "my qsort_ foo::no", &foo_no_m_test },
 { "my qsort_ foo::name", &foo_name_m_test }
#endif /* my --> */
 }, *td;
 const size_t time_data_size = sizeof time_data / sizeof *time_data;
 unsigned td_i;
 FILE *fp = 0, *gnu = 0;
 size_t elements, x, r;
 char fn_name[32] = "not a file";
 unsigned seed = (unsigned)clock();
 const char *err = 0;
 srand(seed), rand(), printf("Seed %u.\n", seed);
 do { /* Try. */
 if(!(gnu = fopen(gnu_name, "w"))) { err = gnu_name; break; }
 fprintf(gnu, "set term postscript eps enhanced color\n"
 "set output \"%s\"\n"
 "set grid\n"
 "set xlabel \"array elements\"\n"
 "set ylabel \"processor time, t (ms)\"\n"
 "set yrange [0:]\n"
 "# seed %u\n"
 "\n"
 "plot", graph_name, seed);
 for(td_i = 0; td = time_data + td_i, td_i < time_data_size; td_i++) {
 /* Open <sort>.tsv for writing. */
 if(sprintf(fn_name, "%s.tsv", td->name) < 0
 || !(fp = fopen(fn_name, "w"))) { err = fn_name; break; }
 fprintf(stderr, "Created/overwrote, \"%s,\" to store data on %s "
 "sort.\n", fn_name, td->name);
 /* Do several experiments, increasing number of elements. */
 fprintf(fp, "# %s: elements ms\n", td->name);
 for(elements = 1, x = 2; elements <= max_elements;
 elements = (unsigned)pow((double)x++, 3.5)) {
 int n = 0;
 double dt_ms = 0.0, mean_ms = 0.0, delta_ms, ssdm = 0.0;
 fprintf(stderr, "%s: sorting %lu elements distributed uniformly"
 " at random.\n", td->name, (unsigned long)elements);
 /* \cite{Welford1962Note} */
 for(r = 0; r < replicas; r++) {
 fprintf(stderr, "#");
 if(!td->fn(elements, &dt_ms)) { err = "experiment"; break; }
 n++;
 delta_ms = dt_ms - mean_ms;
 mean_ms += delta_ms / n;
 ssdm += delta_ms * (dt_ms - mean_ms);
 }
 fprintf(stderr, " done.\n");
 if(r != replicas) break;
 fprintf(fp, "%lu\t%f\t%f\n", (unsigned long)elements, mean_ms,
 sqrt(ssdm / (n - 1)));
 }
 /* close the file */
 if(fclose(fp)) { fp = 0; err = fn_name; break; }
 fp = 0;
 /* write the graph */
 fprintf(gnu,
 "%s\t\"%s\" using 1:2:3 with errorlines lw 3 title \"%s\"",
 td_i ? ", \\\n" : " ", fn_name, td->name);
 }
 fprintf(gnu, "\n");
 } while(0); if(err) perror(err); /* Catch. */
 { /* Finally. */
 if(fp && fclose(fp)) perror(fn_name);
 if(gnu && fclose(gnu)) perror(gnu_name);
 }
 return err ? EXIT_FAILURE : EXIT_SUCCESS;
}

This is code taken from Bentley McIlroy 1993 and Berkeley; I've modified it to take parameters in the form of pre-processor #defines. It must be run with optimisations on since I'm using C89: doesn't support inline. I've gotten rid of the warnings, but I'm not sure this compiles to the same code.

/*
 * Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved
 *
 * Copyright (c) 1992, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 * must display the following acknowledgement:
 * This product includes software developed by the University of
 * California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)qsort.c 8.1 (Berkeley) 6/4/93
 */
/* Modified by Neil to make it parameterised.
 The parameters are preprocessor macros and all are all undefined at the end of
 the file for convenience.
 @param SORT_NAME
 A unique name associated with {<S>} that satisfies {C} naming conventions when
 mangled.
 @param SORT_TYPE
 This type becomes {<S>}; required.
 @param SORT_COMPARATOR
 A function satisfying {<PS>SortComparator}.
 @std C89
 */
/* Check defines. */
#ifndef SORT_NAME
#error Sort name SORT_NAME undefined.
#endif
#ifndef SORT_TYPE
#error Sort type SORT_TYPE undefined.
#endif
#ifndef SORT_COMPARATOR
#error Sort function SORT_COMPARATOR undefined.
#endif
/* Generics using the preprocessor;
 \url{ http://stackoverflow.com/questions/16522341/pseudo-generics-in-c }. */
#ifdef CAT
#undef CAT
#endif
#ifdef CAT_
#undef CAT_
#endif
#ifdef PCAT
#undef PCAT
#endif
#ifdef PCAT_
#undef PCAT_
#endif
#ifdef S
#undef S
#endif
#ifdef S_
#undef S_
#endif
#ifdef PS_
#undef PS_
#endif
#define CAT_(x, y) x ## y
#define CAT(x, y) CAT_(x, y)
#define PCAT_(x, y) x ## _ ## y
#define PCAT(x, y) PCAT_(x, y)
#define S_(thing) CAT(SORT_NAME, thing)
#define PS_(thing) PCAT(sort, PCAT(SORT_NAME, thing)) /* Private. */
/* Troubles with this line? check to ensure that {SORT_TYPE} is a valid type,
 whose definition is placed above {#include "Sort.h"}. */
typedef SORT_TYPE PS_(Type);
#define S PS_(Type)
/** Must establish a partial order amongst {<S>} by returning
 {a < b, negative; a = b, 0; a > b, positive}. */
typedef int (*PS_(SortComparator))(const S *const a, const S *const b);
/* Check that {MAP_COMPARATOR} is a function implementing
 {<PS>SortComparator}. */
static const PS_(SortComparator) PS_(cmp) = (SORT_COMPARATOR);
#include <sys/types.h>
#ifndef SORT_H /* <-- h Idempotent. */
#define SORT_H
#define sort_min(a, b) (a) < (b) ? a : b
enum SortSwapType { WORD, WORDS, BYTE };
#endif /* h --> */
/*
 * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
 *
 * (How does that work with the copyright?)
 */
static void
PS_(swapfunc)(S *a, S *b, const size_t n, const enum SortSwapType swaptype)
{
 switch(swaptype) {
 case WORDS:
 case WORD:
 {
 long i = sizeof *a * n / sizeof(long);
 long *pi = (long *)(void *)a;
 long *pj = (long *)(void *)b;
 do {
 long t = *pi;
 *pi++ = *pj;
 *pj++ = t;
 } while (--i > 0);
 }
 break;
 case BYTE:
 {
 long i = sizeof *a * n;
 char *pi = (char *)a;
 char *pj = (char *)b;
 do {
 char t = *pi;
 *pi++ = *pj;
 *pj++ = t;
 } while (--i > 0);
 }
 break;
 }
}
static void
PS_(swap)(S *a, S *b, const enum SortSwapType swaptype)
{
 if(swaptype == WORD) {
 long t = *(long *)(void *)(a);
 *(long *)(void *)(a) = *(long *)(void *)(b);
 *(long *)(void *)(b) = t;
 } else {
 PS_(swapfunc)(a, b, (size_t)1, swaptype);
 }
}
static S *
PS_(med3)(S *a, S *b, S *c)
{
 return PS_(cmp)(a, b) < 0 ?
 (PS_(cmp)(b, c) < 0 ? b : (PS_(cmp)(a, c) < 0 ? c : a ))
 : (PS_(cmp)(b, c) > 0 ? b : (PS_(cmp)(a, c) < 0 ? a : c ));
}
static void
S_(Sort)(S *a, size_t n)
{
 S *pa, *pb, *pc, *pd, *pl, *pm, *pn;
 size_t vec;
 long d;
 enum SortSwapType swaptype;
 int swap_cnt, r;
loop:
 /* Re: (0*n) "warning: use of logical || with constant operand; switch to
 bitwise | or remove constant [-Wconstant-logical-operand]" but sizeof are
 not supported in pre-precessor. :S Wouldn't this be constant? */
 swaptype = ((char *)a - (char *)0) % sizeof(long)
 || sizeof *a % sizeof(long) + (0*n) ? BYTE
 : sizeof *a == sizeof(long) ? WORD : WORDS;
 swap_cnt = 0;
 if (n < 7) {
 for (pm = a + 1; pm < a + n; pm++)
 for (pl = pm; pl > a && PS_(cmp)(pl - 1, pl) > 0; pl--)
 PS_(swap)(pl, pl - 1, swaptype);
 return;
 }
 pm = a + (n / 2);
 if (n > 7) {
 pl = a;
 pn = a + (n - 1);
 if (n > 40) {
 d = n / 8;
 pl = PS_(med3)(pl, pl + d, pl + 2 * d);
 pm = PS_(med3)(pm - d, pm, pm + d);
 pn = PS_(med3)(pn - 2 * d, pn - d, pn);
 }
 pm = PS_(med3)(pl, pm, pn);
 }
 PS_(swap)(a, pm, swaptype);
 pa = pb = a + 1;
 pc = pd = a + (n - 1);
 for (;;) {
 while (pb <= pc && (r = PS_(cmp)(pb, a)) <= 0) {
 if (r == 0) {
 swap_cnt = 1;
 PS_(swap)(pa, pb, swaptype);
 pa++;
 }
 pb++;
 }
 while (pb <= pc && (r = PS_(cmp)(pc, a)) >= 0) {
 if (r == 0) {
 swap_cnt = 1;
 PS_(swap)(pc, pd, swaptype);
 pd--;
 }
 pc--;
 }
 if (pb > pc)
 break;
 PS_(swap)(pb, pc, swaptype);
 swap_cnt = 1;
 pb++;
 pc--;
 }
 if (swap_cnt == 0) { /* Switch to insertion sort */
 for (pm = a + 1; pm < a + n; pm++)
 for (pl = pm; pl > a && PS_(cmp)(pl - 1, pl) > 0; pl--)
 PS_(swap)(pl, pl - 1, swaptype);
 return;
 }
 pn = a + n;
 vec = sort_min(pa - a, pb - pa);
 if(vec > 0) PS_(swapfunc)(a, pb - vec, vec, swaptype);
 vec = sort_min((size_t)(pd - pc), (size_t)(pn - pd - 1));
 if(vec > 0) PS_(swapfunc)(pb, pn - vec, vec, swaptype);
 if ((size_t)(vec = pb - pa) > 1)
 S_(Sort)(a, vec);
 if ((vec = pd - pc) > 1) {
 /* Iterate rather than recurse to save stack space */
 a = pn - vec;
 n = vec;
 goto loop;
 }
}
/* Un-define all macros. Undocumented; allows nestled inclusion so long as:
 {CAT_}, {CAT}, {PCAT}, {PCAT_} conform, and {S}, is not used. */
#ifdef SORT_SUBTYPE /* <-- sub */
#undef SORT_SUBTYPE
#else /* sub --><-- !sub */
#undef CAT
#undef CAT_
#undef PCAT
#undef PCAT_
#endif /* !sub --> */
#undef S
#undef S_
#undef PS_
#undef SORT_NAME
#undef SORT_TYPE
#undef SORT_COMPARATOR

On my computer, it outputs, with optimisations on, via gnuplot. It is faster, but not by much.

As expected, parameterised sort is better then a general qsort.

I'm not sure about the changes I've made to qsort.c. Specifically, why swaptype = (a-(char*)0 | es) % W ? 2 : es > W ? 1 : 0 (Bentley1993,) where es is the size. Why is it called every time? Shouldn't it be a | es without the subtracting null? Isn't a aligned properly and it might be es and taken out of the loop?

asked Apr 6, 2019 at 1:38
\$\endgroup\$
4
  • 1
    \$\begingroup\$ you might be interested in Perl's qsort which switches to insertion sort on small partitions. \$\endgroup\$ Commented Apr 6, 2019 at 2:28
  • \$\begingroup\$ I think that they were heavily influenced by the same paper. \$\endgroup\$ Commented Apr 6, 2019 at 2:42
  • 2
    \$\begingroup\$ It would be interesting to see where std::sort places on that graph. \$\endgroup\$ Commented Apr 6, 2019 at 4:06
  • 1
    \$\begingroup\$ @OhMyGoodness it doesn't involve a qsort, but if researching hybrid sorting routines, Python's Timsort probably warrants a look. \$\endgroup\$ Commented Sep 20 at 16:09

1 Answer 1

3
\$\begingroup\$

It took quite some time for me to understand this comment:

/*
 The parameters are preprocessor macros and all are all undefined at the end of
 the file for convenience.
 */

Part of the problem is the stray "all" in there. It's not relevant to users where in the header file they get undefined, so I would just say something like "The parameters are preprocessor macros and will become undefined once this header is included".


This pattern is unnecessary:

#ifdef CAT
#undef CAT
#endif

It's perfectly valid (and idiomatic) to undef macros that have never been defined.


<sys/types.h> should not be needed. There's no reason for this code not to be portable.


swapfunc is missing an optimisation opportunity. We care most about whether the actual size to be swapped is a multiple of the size of long, which happens when sizeof *a * n % sizeof (long) is zero. We can end up using the byte-by-byte approach when the size of *a isn't a multiple of the size of long, but multiplying by n makes it so.

In any case, this is probably better abstracted as a function void mem_swap(void *a, void *b, size_t count) along the lines of memcpy() or memmove() and open to the same kinds of optimisation used there (e.g. using more registers and/or SIMD instructions where available).


Some signed/unsigned conversions can be eliminated for safer code. E.g.

 /*long*/ size_t i = sizeof *a * n / sizeof(long);
 /*long*/ size_t i = sizeof *a * n;
 /*long*/ size_t d = n / 8;

(char *)a - (char *)0 is UB, because these are not pointers into the same object. It appears that this is a test of alignment, but it's not a portable way to go about it. In modern C, we should convert a to uintptr_t instead.


I think most C programmers would prefer for (;;) with suitable break rather than goto loop.


I don't completely follow the implementation - the commentary is very sparse. I think we choose the smaller partition for the recursive call and the larger for the iteration, but it's hard to be sure I've read that correctly.


The benchmark program would be better if it were deterministically reproducible. So instead of using clock() as input to srand() prefer a fixed seed so that every execution has the same input.

answered Sep 20 at 15:43
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.