Max-heap implementation in C

Question 1

I have tried to implement my Heap in C. The following are the 13 operations defined:

build_maxheap
insert
exctract_max (delete heap max root)
max_delete (delete an element or key)
max_heapify
clear
heapSort
get_max
print
increase_key (helper function for insert and delete key functions)
height
is_empty
is_maxheap (checks if the array is a heap)

This is the code in C:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
typedef struct MaxHeap {
 int size;
 int* heap;
} MaxHeap;
const MaxHeap maxheap_init = { .size = 0, .heap = NULL };
int INF = 1000, N_INF = -1000;
void swap(int* a, int* b) {
 int temp = *a;
 *a = *b;
 *b = temp;
}
int parent(int i) { 
 return (i-1)/2; 
}
 
int left(int i) { 
 return (2*i + 1); 
}
 
int right(int i) { 
 return (2*i + 2);
}
int get_max(MaxHeap* max_heap) { //O(1)
 return max_heap->heap[0];
}
int is_empty(MaxHeap* max_heap) {
 return max_heap->heap[0] == N_INF;
}
int height(MaxHeap* max_heap) {
 return floor(log2(max_heap->size));
}
MaxHeap* create_maxheap(int size) {
 MaxHeap* heap = calloc(size, sizeof * heap);
 if (!heap) return heap;
 heap->size = size;
 return heap;
}
void max_heapify(int* data, int* key_index, int i, int size) { // O(logn)
 int largest = i, leftie = left(i), rightie = right(i);
 
 if (leftie < size && data[leftie] > data[largest])
 largest = leftie;
 if (rightie < size && data[rightie] > data[largest])
 largest = rightie;//transitvity 
 if (largest != i) {
 swap(&data[i], &data[largest]);
 //
 key_index[data[i]] = i;
 key_index[data[largest]] = largest;
 //
 max_heapify(data, key_index, largest, size);
 }
}
void increase_key(MaxHeap* max_heap, int* key_index, int j, int key) {
 if (key < max_heap->heap[j])
 printf("Error: key must be larger \n");
 else {
 max_heap->heap[j] = key;
 key_index[key] = j;
 for (int i = j; i > 0; i = parent(i)) {
 if (max_heap->heap[i] > max_heap->heap[parent(i)]) {
 swap(&max_heap->heap[i], &max_heap->heap[parent(i)]);
 key_index[max_heap->heap[i]] = i;
 key_index[max_heap->heap[parent(i)]] = parent(i);
 }
 else break;
 }
 }
}
int* build_max_heap(int* data, int* key_index, int size) { //O(n)
 for (int i = size / 2 - 1; i >= 0; i--) {
 max_heapify(data, key_index, i, size);
 }
 return data;
}
void insert(MaxHeap* max_heap, int* key_index, int key) { // O(logn)
 int* temp = realloc (max_heap->heap, (max_heap->size + 1) * sizeof (*(max_heap->heap)));
 if (temp) {
 max_heap->heap = temp;
 max_heap->heap[max_heap->size] = N_INF;
 increase_key(max_heap, key_index, max_heap->size, key);
 max_heap->size += 1;
 temp = 0;
 }
}
void extract_max(MaxHeap* max_heap, int* key_index) { // O(logn)
 swap(&max_heap->heap[0], &max_heap->heap[max_heap->size - 1]);
 max_heap->size -= 1;
 max_heapify(max_heap->heap, key_index, 0, max_heap->size);
}
void delete_key(MaxHeap* max_heap, int* key_index, int key) { //O(logn)
 int index = key_index[key];
 increase_key(max_heap, key_index, index, INF);
 extract_max(max_heap, key_index);
}
void clear(MaxHeap* max_heap) {
 for (int i = 0; i < max_heap->size; i++)
 max_heap->heap[i] = N_INF; //assuming it is never in the heap
 max_heap->size = 0;
}
void print_heap(MaxHeap* max_heap) { // O(n)
 int size = max_heap->size;
 if (size % 2 != 0) 
 size = size + 1;
 for (int i = 0; i < size/2 - 1; i++) {
 printf("Parent: %d -> Left: %d | Right: %d \n", max_heap->heap[i], max_heap->heap[2*i+1], max_heap->heap[2*i+2]);
 }
 int j = max_heap->size/2 - 1;
 if (max_heap->size % 2 == 0)
 printf("Parent: %d -> Left: %d \n", max_heap->heap[j], max_heap->heap[2*j+1]);
}
int is_maxheap(MaxHeap* max_heap) { // O(n)
 for (int i = max_heap->size-1; i > 0 ; i--) {
 if (max_heap->heap[i] > max_heap->heap[parent(i)]) {
 return 0;
 }
 }
 return 1;
}
void heap_sort(MaxHeap* max_heap, int* key_index) { // assumes already a max heap as argument
 // if the input is not a heap, call build_max_heap()
 for (int i = max_heap->size - 1; i >= 0; i--) {
 swap(&max_heap->heap[0], &max_heap->heap[i]);
 max_heapify(max_heap->heap, key_index, 0, i);
 }
}
void print_arr(MaxHeap* max_heap) {
 for (int i = 0; i < max_heap->size; i++)
 printf("%d ", max_heap->heap[i]);
 printf("\n");
}
int main() {
 
 int size = 6, MAX_Key = INF, elements[6] = {10, 20, 15, 12, 40, 25};
 int* data = calloc(size, sizeof * data);
 if (!data) return 0;
 for (int i = 0; i < size; i++)
 data[i] = elements[i];
 int* key_index = calloc(MAX_Key, sizeof * key_index); // we assume key > 0, if key < 0, then we add an additional array
 if (!key_index) return 0;
 for (int i = 0; i < size; i++)
 key_index[elements[i]] = i;
 
 MaxHeap max_heap = maxheap_init;
 max_heap.size = size;
 max_heap.heap = build_max_heap(data, key_index, size);
 
 print_heap(&max_heap);
 printf("is_max_heap: %d \n", is_maxheap(&max_heap));
 extract_max(&max_heap, key_index);
 printf("is_max_heap: %d \n", is_maxheap(&max_heap));
 print_heap(&max_heap);
 insert(&max_heap, key_index, 14);
 printf("is_max_heap: %d \n", is_maxheap(&max_heap));
 print_heap(&max_heap);
 delete_key(&max_heap, key_index, 14);
 printf("is_max_heap: %d \n", is_maxheap(&max_heap));
 print_heap(&max_heap);
 printf("height: %d \n", height(&max_heap));
 printf("max: %d \n", get_max(&max_heap));
 heap_sort(&max_heap, key_index);
 print_arr(&max_heap);
 clear(&max_heap);
 printf("is empty: %d \n", is_empty(&max_heap));
 
 free(data);
 free(key_index);
 
 return 0;
}

If you have any improvement ideas, I would be very grateful to read through them. I should note that this implementation is for learning purposes, not for a client or something. There might be some place when I missed some secondary check for an empty array or something, but I am interested in seeing if the code holds or not. I tested it and it works just fine.

Question 2

This is a max-heap datastructure, and there is very consistent nomenclature here which reflects that. I'm sure it is clear and commendable. But I wouldn't have minded seeing it written in a comment, once, and then we just call it a heap in the code. (Or perhaps there's some design document, not in the submission, that talks about future plans for a min-heap.)

Consider adopting a code style (maybe GNU? or Google?) and enforcing it with a linter or pretty printer. There are well-known issues that stem from neglecting to adorn single-line if / for statements with curly braces. Type a couple of "extra" { } characters -- it will be worth it in the end!

I guess this looks mostly plausible:

int height(MaxHeap* max_heap) {
 return floor(log2(max_heap->size));
}

It's not obvious to me that most callers would expect a floor result instead of a ceil result. But there's no comment documentation specifying its behavior, and we can only report a bug w.r.t. some written spec, so this code's behavior must be correct. We're not restricted to power-of-two sizes, and it's nice that that is pretty clear.

There is no warning that caller is responsible for verifying non-empty prior to calling this, for example we see that maxheap_init is valid but empty. I can't say that I feel good about returning negative HUGE_VAL in the empty case, since 0 seems the most natural response for that.

This submission includes a demo but no unit tests. In particular, we see no self-evaluating tests that would offer concrete example return values for particular heaps. Adding such test(s) would bring documentation value.

max_heapify's recursive call to itself is Fine.

If this was implemented in scheme, it's a sure thing the tail recursion would be optimized, with no extra stack usage.

This submission does not include a Makefile or build script that shows which compiler is used with which compilation flags.

For some compiler target of interest, it would be useful for a comment to mention whether recursion costs us stack space here or not. A caller might know that it is tight on stack space and would want to know if it is safe to heapify. A https://godbolt.org link could suffice.

Alternatively, you might prefer to code a while loop here.

insert should probably report fatal error if caller violates this constraint:

N_INF < key < INF

Given the void signature, I can't say I get warm fuzzies about how a practical application would behave once insert exhausts memory. At a minimum we need some comments describing caller's responsibility for checking errno.

 temp = 0;

What's that all about? Leftover debug? This isn't java, it's not like nulling out the pointer will enable some garbage collection.

Making extract_max void seems odd. Yes, caller could grab the value prior to the call. But it seems natural to return the extracted value, no?

Or maybe this should have been a private helper for delete_key ?

In clear, "assume" seems like the wrong verb.

 max_heap->heap[i] = N_INF; //assuming it is never in the heap

As touched on above, restricted range of heap values is a fundamental invariant of this datastructure. We "enforce" what range those values are in. Prefer a verb like "shall never be in the heap".

print_heap does this:

 ... max_heap->heap[i], max_heap->heap[2*i+1], max_heap->heap[2*i+2]);

Rather than those expressions, prefer the left / right helpers.

Also, is there an opportunity to use the parent helper, here?

The is_maxheap identifier is suggestive of Boolean, very nice, thank you. But couldn't we offer a return type in the signature that is narrower than signed integer?

print_arr does this:

 printf("%d ", max_heap->heap[i]);

Given the limited range of heap values, it might be convenient to impose a fixed number of display columns for each decimal value. Then they'd have a better chance of lining up when wrapped at the terminal's line length.

main tidies up after itself:

 free(data);
 free(key_index);

I am looking at the allocation in create_maxheap. But we never called that, preferring build_max_heap instead. Maybe delete some vestigial code?

main correctly verifies allocations, and silently drops out upon failure. Not the friendliest behavior -- it is likely to increase the cost of supporting this code in the field when problem reports hit the helpdesk. Consider linking against allocation routines that are a bit more on the chatty side when things go south.

Overall?

This looks fairly solid. I would be willing to delegate or accept maintenance tasks for this code base. Ship it!

Question 3

Thanks very much, and I even learnt some English along the way ! You mean I don t need to free in main?

Question 4

I think free'ing in main is an excellent habit, which I encourage. Maybe someone links another app against this, or renames it, or copy-n-pastes code -- the tidying up is entirely appropriate. I was just doing an inventory: Let's see, alloc/free, alloc/free, alloc, wait, what happened there? Oh, yeah! Never called, so not an issue. And then I wondered if you maybe want to delete a function that used to be called but no longer is.

Question 5

One suggestion, dumb down the answers a little bit. Quite a number of our users are English as a second language. A number are still in high school if they are native English speakers. I do understand everything you write but I speak American English and I went to a university to study computer science. Many of our users are hobbyists. Plus 1.

Question 6

Please don't dumb down the answers :-)

Question 7

General Observations

Since this might be a set of library routines, it might be a good idea to move all the heap related functions to their own .c file and have a header file (.h) with function prototypes and struct declarations. Functions such as swap(), parent(), left() and right() should be contained within the heap .c and declared as static so that they are not global functions(). This will allow other data structures to use similar function names if necessary.

Contain Logic Blocks in Curly Braces

In the review by J_H it is mentioned that all then and else clauses of an if statement as well as all loops should be contained within curly braces ({ and }). This is actually a best practice for C and C++. All the companies that I have worked for have had coding standards that required this.

The reason that this is a best practice is that one of the most common errors in code maintenance is the failure to create a logic block when a statement is added to an if statement. It is better to make maintenance coding easier by putting all code into logic blocks to begin with.

Use Stronger Compiler Warnings

As J_H mentioned, we don't know what compiler you are using, but if you are using gcc you should add some additional warning flags to improve your code. These are the flags I would recommend:

-pendantic
-Wall
Wextra
-Wconversion

These flags found the following warnings in the code:

In function 'height':
48:12: warning: conversion from 'double' to 'int' may change value [-Wfloat-conversion] return floor(log2(max_heap->size));

In function 'create_maxheap':
52:28: warning: conversion to 'size_t' {aka 'long unsigned int'} from 'int' may change the sign of the result [-Wsign-conversion]

52 | MaxHeap* heap = calloc(size, sizeof * heap);
In function 'insert':
100:63: warning: conversion to 'long unsigned int' from 'int' may change the sign of the result [-Wsign-conversion]
100 | int* temp = realloc (max_heap->heap, (max_heap->size + 1) * sizeof (*(max_heap->heap)));

In function 'main':
166:24: warning: conversion to 'size_t' {aka 'long unsigned int'} from 'int' may change the sign of the result [-Wsign-conversion]
166 | int* data = calloc(size, sizeof * data);

170:29: warning: conversion to 'size_t' {aka 'long unsigned int'} from 'int' may change the sign of the result [-Wsign-conversion]
170 | int* key_index = calloc(MAX_Key, sizeof * key_index); // we assume key > 0, if key < 0, then we add an additional array

Prefer `size_t` Over `int` for Size Variables

There is a special unsigned integer type used for the size of allocated memory and indexes in arrays. The C operator sizeof() returns the type size_t. The implementation of size_t varies by system but it always stores the maximum size of an array for a system. It is used for indexing arrays because it can't become negative, negative indexes can cause Undefined Behavior which can cause bugs.

The size_t type is the preferred input for malloc(), calloc() and realloc().

typedef struct MaxHeap {
 size_t size;
 int* heap;
} MaxHeap;

Readability and Maintainability

For readability and maintainability each variable should be declared and initialized on its own line.

Line 19:

int INF = 1000, N_INF = -1000;

Should be:

int INF = 1000;
int N_INF = -1000;

Line 59:

 int largest = i, leftie = left(i), rightie = right(i);

Should be:

 size_t largest = i;
 size_t leftie = left(i);
 size_t rightie = right(i);

In main(), line 167:

 int size = 6, MAX_Key = INF, elements[6] = {10, 20, 15, 12, 40, 25};

should be changed as follows, you don't need the [6] for elements and you can calculate the size:

 int elements[] = {10, 20, 15, 12, 40, 25};
 size_t size = (sizeof(elements)/sizeof(elements[0]));
 size_t MAX_Key = INF;

Question 8

The "operator sizeof()" is actually just sizeof, as I'm sure you know.

Question 9

The biggest issue I see here, and is not yet commented on, is the call to realloc for every insertion. When you remove an element from the heap, the size value is changed but the memory block (array) is not.

So when you remove one element and then insert another, the realloc call does nothing, as the array has the right size. But what happens if you remove two elements, and then insert one? realloc will reduce the array size by 1. For the next insertion the array has to grow again. Hopefully your implementation of realloc will not look for a new memory block here and copy data over...

In any case, increasing the array size by 1 every time you insert an element is expensive, it makes the insertion operation have a cost of O(n) instead of O(log n), because the array needs to be copied over. Typically we double the array size when we run out of space, so that the insertion operation remains of O(log n). You can accomplish this by separating the array size and the number of elements in the heap. This is slightly more bookkeeping, for the benefit of faster insertions, and no need for that marker value (the heap is empty if the "number of elements" variable is 0). For example:

typedef struct MaxHeap {
 int* heap; // pointer to array
 int size; // number of elements stored in array
 int capacity; // size of array
} MaxHeap;

Another issue is that the heap insertion reallocates the array, changing the max_heap.heap pointer, but not the data pointer in main(), which now might point to deallocated memory. At the end of the program, data is freed (a potential exception), but not max_heap.heap (a potential memory leak).

I think it is cleaner if MaxHeap owns the data it points to, and at the end of the program you call some function that frees the data in the heap, say free_maxheap(max_heap). For best effect, you'd actually use create_maxheap(), so that the heap allocates its own data array. You should not have a function like build_max_heap() that takes a pointer from the caller and keeps it. Currently it is unclear to the caller: can I free the pointer after calling build_max_heap()? No, because the pointer is still being used, though this is not documented anywhere. It is much cleaner if the caller passes a pointer to data, but the data is copied over into the heap. If for efficiency you want to have a function that takes ownership of the input pointer, then this needs to be clearly documented, both in comments and in the function name (maybe create_maxheap_with_array()?).

I really don't like the key_index array being a separate data structure that needs to be passed into all functions that work on the heap. It needs to be part of the heap data structure.

It seems that the key_index array is only read in delete_key(), everywhere else it is just updated. This is a lot of bookkeeping for the benefit of being able to delete a specific value from the heap in O(log n). I've never written a heap with this ability, I don't know if I'd do it the same way. If an O(n) delete is OK (you don't delete often), then consider just looking through the heap to find the key. If you delete often, is a heap the right data structure?

Also, key_index being an array means that keys must be small non-negative integers. Consider using a hash table instead.

What if you want to insert() a value that is larger than what fits in key_index? You don't check for this in insert()!

Question 10

Avoid FP math for an integer problem

floor(log2(max_heap->size)) is overkill and at some level risks incorrect results with less than ideal FP functions. A simply loop will suffice.

unsigned log2 = 0;
unsigned v = max_heap->size;
while (v >>= 1) {
 log2++;
}
return log2;

Fast hacks.

Name space

Rather than names all over the place like

...
void increase_key()
int* build_max_heap()
void insert()
...

Move your "13 operations" into a nearby namespace.

...
void mh_increase_key()
int* mh_build_max_heap()
void mh_insert()
 mh_...()
...

Question 11

You linked to the bit-twiddling hacks page, but you showed the naive approach used to validate the actual bit-twiddling hacks. Kind of missing the point there, eh? :-) Really, what you want is for the compiler to generate a CPU instruction that produces the value directly by counting the number of leading zero bits. If you're on GCC, use the __builtin_clz intrinsic. On MSVC, use _BitScanReverse. See also: codereview.stackexchange.com/a/151331

Question 12

@CodyGray I prefer to use non-compiler specific code. I have found good compilers often see standard code and emit efficient code anyways, so a reduced need for me to do the compiler's job.

Question 13

That's a good rule-of-thumb, but, as far as I know, doesn't apply in this case. Do you know otherwise? Certainly, the code you showed above is not recognized as idiomatic by any compiler that I know of and is therefore not transformed into the expected BSR instruction, even on a bog-standard x86 target. I did search when I wrote the above comment to see if I could find a good Stack Overflow Q&A to point to, but I couldn't find one that touched upon the correct way to write the C code to get the compiler to generate efficient code. The only solutions offered were intrinsics.

J_H J_H 41.8k3 gold badges38 silver badges157 bronze badges · Accepted Answer · 2023-03-05 16:37:14Z

This is a max-heap datastructure, and there is very consistent nomenclature here which reflects that. I'm sure it is clear and commendable. But I wouldn't have minded seeing it written in a comment, once, and then we just call it a heap in the code. (Or perhaps there's some design document, not in the submission, that talks about future plans for a min-heap.)

Consider adopting a code style (maybe GNU? or Google?) and enforcing it with a linter or pretty printer. There are well-known issues that stem from neglecting to adorn single-line if / for statements with curly braces. Type a couple of "extra" { } characters -- it will be worth it in the end!

I guess this looks mostly plausible:

int height(MaxHeap* max_heap) {
 return floor(log2(max_heap->size));
}

It's not obvious to me that most callers would expect a floor result instead of a ceil result. But there's no comment documentation specifying its behavior, and we can only report a bug w.r.t. some written spec, so this code's behavior must be correct. We're not restricted to power-of-two sizes, and it's nice that that is pretty clear.

There is no warning that caller is responsible for verifying non-empty prior to calling this, for example we see that maxheap_init is valid but empty. I can't say that I feel good about returning negative HUGE_VAL in the empty case, since 0 seems the most natural response for that.

This submission includes a demo but no unit tests. In particular, we see no self-evaluating tests that would offer concrete example return values for particular heaps. Adding such test(s) would bring documentation value.

max_heapify's recursive call to itself is Fine.

If this was implemented in scheme, it's a sure thing the tail recursion would be optimized, with no extra stack usage.

This submission does not include a Makefile or build script that shows which compiler is used with which compilation flags.

For some compiler target of interest, it would be useful for a comment to mention whether recursion costs us stack space here or not. A caller might know that it is tight on stack space and would want to know if it is safe to heapify. A https://godbolt.org link could suffice.

Alternatively, you might prefer to code a while loop here.

insert should probably report fatal error if caller violates this constraint:

N_INF < key < INF

Given the void signature, I can't say I get warm fuzzies about how a practical application would behave once insert exhausts memory. At a minimum we need some comments describing caller's responsibility for checking errno.

 temp = 0;

What's that all about? Leftover debug? This isn't java, it's not like nulling out the pointer will enable some garbage collection.

Making extract_max void seems odd. Yes, caller could grab the value prior to the call. But it seems natural to return the extracted value, no?

Or maybe this should have been a private helper for delete_key ?

In clear, "assume" seems like the wrong verb.

 max_heap->heap[i] = N_INF; //assuming it is never in the heap

As touched on above, restricted range of heap values is a fundamental invariant of this datastructure. We "enforce" what range those values are in. Prefer a verb like "shall never be in the heap".

print_heap does this:

 ... max_heap->heap[i], max_heap->heap[2*i+1], max_heap->heap[2*i+2]);

Rather than those expressions, prefer the left / right helpers.

Also, is there an opportunity to use the parent helper, here?

The is_maxheap identifier is suggestive of Boolean, very nice, thank you. But couldn't we offer a return type in the signature that is narrower than signed integer?

print_arr does this:

 printf("%d ", max_heap->heap[i]);

Given the limited range of heap values, it might be convenient to impose a fixed number of display columns for each decimal value. Then they'd have a better chance of lining up when wrapped at the terminal's line length.

main tidies up after itself:

 free(data);
 free(key_index);

I am looking at the allocation in create_maxheap. But we never called that, preferring build_max_heap instead. Maybe delete some vestigial code?

main correctly verifies allocations, and silently drops out upon failure. Not the friendliest behavior -- it is likely to increase the cost of supporting this code in the field when problem reports hit the helpdesk. Consider linking against allocation routines that are a bit more on the chatty side when things go south.

Overall?

This looks fairly solid. I would be willing to delegate or accept maintenance tasks for this code base. Ship it!

Thanks very much, and I even learnt some English along the way ! You mean I don t need to free in main?
I think free'ing in main is an excellent habit, which I encourage. Maybe someone links another app against this, or renames it, or copy-n-pastes code -- the tidying up is entirely appropriate. I was just doing an inventory: Let's see, alloc/free, alloc/free, alloc, wait, what happened there? Oh, yeah! Never called, so not an issue. And then I wondered if you maybe want to delete a function that used to be called but no longer is.
One suggestion, dumb down the answers a little bit. Quite a number of our users are English as a second language. A number are still in high school if they are native English speakers. I do understand everything you write but I speak American English and I went to a university to study computer science. Many of our users are hobbyists. Plus 1.

Stack Exchange Network

Max-heap implementation in C

4 Answers 4

General Observations

Contain Logic Blocks in Curly Braces

Use Stronger Compiler Warnings

Prefer `size_t` Over `int` for Size Variables

Readability and Maintainability

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Max-heap implementation in C

4 Answers 4

General Observations

Contain Logic Blocks in Curly Braces

Use Stronger Compiler Warnings

Prefer size_t Over int for Size Variables

Readability and Maintainability

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions

Prefer `size_t` Over `int` for Size Variables