I have tried to implement my Heap in C. The following are the 13 operations defined:
- build_maxheap
- insert
- exctract_max (delete heap max root)
- max_delete (delete an element or key)
- max_heapify
- clear
- heapSort
- get_max
- increase_key (helper function for insert and delete key functions)
- height
- is_empty
- is_maxheap (checks if the array is a heap)
This is the code in C:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
typedef struct MaxHeap {
int size;
int* heap;
} MaxHeap;
const MaxHeap maxheap_init = { .size = 0, .heap = NULL };
int INF = 1000, N_INF = -1000;
void swap(int* a, int* b) {
int temp = *a;
*a = *b;
*b = temp;
}
int parent(int i) {
return (i-1)/2;
}
int left(int i) {
return (2*i + 1);
}
int right(int i) {
return (2*i + 2);
}
int get_max(MaxHeap* max_heap) { //O(1)
return max_heap->heap[0];
}
int is_empty(MaxHeap* max_heap) {
return max_heap->heap[0] == N_INF;
}
int height(MaxHeap* max_heap) {
return floor(log2(max_heap->size));
}
MaxHeap* create_maxheap(int size) {
MaxHeap* heap = calloc(size, sizeof * heap);
if (!heap) return heap;
heap->size = size;
return heap;
}
void max_heapify(int* data, int* key_index, int i, int size) { // O(logn)
int largest = i, leftie = left(i), rightie = right(i);
if (leftie < size && data[leftie] > data[largest])
largest = leftie;
if (rightie < size && data[rightie] > data[largest])
largest = rightie;//transitvity
if (largest != i) {
swap(&data[i], &data[largest]);
//
key_index[data[i]] = i;
key_index[data[largest]] = largest;
//
max_heapify(data, key_index, largest, size);
}
}
void increase_key(MaxHeap* max_heap, int* key_index, int j, int key) {
if (key < max_heap->heap[j])
printf("Error: key must be larger \n");
else {
max_heap->heap[j] = key;
key_index[key] = j;
for (int i = j; i > 0; i = parent(i)) {
if (max_heap->heap[i] > max_heap->heap[parent(i)]) {
swap(&max_heap->heap[i], &max_heap->heap[parent(i)]);
key_index[max_heap->heap[i]] = i;
key_index[max_heap->heap[parent(i)]] = parent(i);
}
else break;
}
}
}
int* build_max_heap(int* data, int* key_index, int size) { //O(n)
for (int i = size / 2 - 1; i >= 0; i--) {
max_heapify(data, key_index, i, size);
}
return data;
}
void insert(MaxHeap* max_heap, int* key_index, int key) { // O(logn)
int* temp = realloc (max_heap->heap, (max_heap->size + 1) * sizeof (*(max_heap->heap)));
if (temp) {
max_heap->heap = temp;
max_heap->heap[max_heap->size] = N_INF;
increase_key(max_heap, key_index, max_heap->size, key);
max_heap->size += 1;
temp = 0;
}
}
void extract_max(MaxHeap* max_heap, int* key_index) { // O(logn)
swap(&max_heap->heap[0], &max_heap->heap[max_heap->size - 1]);
max_heap->size -= 1;
max_heapify(max_heap->heap, key_index, 0, max_heap->size);
}
void delete_key(MaxHeap* max_heap, int* key_index, int key) { //O(logn)
int index = key_index[key];
increase_key(max_heap, key_index, index, INF);
extract_max(max_heap, key_index);
}
void clear(MaxHeap* max_heap) {
for (int i = 0; i < max_heap->size; i++)
max_heap->heap[i] = N_INF; //assuming it is never in the heap
max_heap->size = 0;
}
void print_heap(MaxHeap* max_heap) { // O(n)
int size = max_heap->size;
if (size % 2 != 0)
size = size + 1;
for (int i = 0; i < size/2 - 1; i++) {
printf("Parent: %d -> Left: %d | Right: %d \n", max_heap->heap[i], max_heap->heap[2*i+1], max_heap->heap[2*i+2]);
}
int j = max_heap->size/2 - 1;
if (max_heap->size % 2 == 0)
printf("Parent: %d -> Left: %d \n", max_heap->heap[j], max_heap->heap[2*j+1]);
}
int is_maxheap(MaxHeap* max_heap) { // O(n)
for (int i = max_heap->size-1; i > 0 ; i--) {
if (max_heap->heap[i] > max_heap->heap[parent(i)]) {
return 0;
}
}
return 1;
}
void heap_sort(MaxHeap* max_heap, int* key_index) { // assumes already a max heap as argument
// if the input is not a heap, call build_max_heap()
for (int i = max_heap->size - 1; i >= 0; i--) {
swap(&max_heap->heap[0], &max_heap->heap[i]);
max_heapify(max_heap->heap, key_index, 0, i);
}
}
void print_arr(MaxHeap* max_heap) {
for (int i = 0; i < max_heap->size; i++)
printf("%d ", max_heap->heap[i]);
printf("\n");
}
int main() {
int size = 6, MAX_Key = INF, elements[6] = {10, 20, 15, 12, 40, 25};
int* data = calloc(size, sizeof * data);
if (!data) return 0;
for (int i = 0; i < size; i++)
data[i] = elements[i];
int* key_index = calloc(MAX_Key, sizeof * key_index); // we assume key > 0, if key < 0, then we add an additional array
if (!key_index) return 0;
for (int i = 0; i < size; i++)
key_index[elements[i]] = i;
MaxHeap max_heap = maxheap_init;
max_heap.size = size;
max_heap.heap = build_max_heap(data, key_index, size);
print_heap(&max_heap);
printf("is_max_heap: %d \n", is_maxheap(&max_heap));
extract_max(&max_heap, key_index);
printf("is_max_heap: %d \n", is_maxheap(&max_heap));
print_heap(&max_heap);
insert(&max_heap, key_index, 14);
printf("is_max_heap: %d \n", is_maxheap(&max_heap));
print_heap(&max_heap);
delete_key(&max_heap, key_index, 14);
printf("is_max_heap: %d \n", is_maxheap(&max_heap));
print_heap(&max_heap);
printf("height: %d \n", height(&max_heap));
printf("max: %d \n", get_max(&max_heap));
heap_sort(&max_heap, key_index);
print_arr(&max_heap);
clear(&max_heap);
printf("is empty: %d \n", is_empty(&max_heap));
free(data);
free(key_index);
return 0;
}
If you have any improvement ideas, I would be very grateful to read through them. I should note that this implementation is for learning purposes, not for a client or something. There might be some place when I missed some secondary check for an empty array or something, but I am interested in seeing if the code holds or not. I tested it and it works just fine.
4 Answers 4
This is a max-heap datastructure,
and there is very consistent nomenclature here which reflects that.
I'm sure it is clear and commendable.
But I wouldn't have minded seeing it written in a comment, once,
and then we just call it a heap
in the code.
(Or perhaps there's some design document, not in the submission,
that talks about future plans for a min-heap.)
Consider adopting a code style (maybe GNU? or Google?)
and enforcing it with a linter or pretty printer.
There are well-known issues that stem from
neglecting to adorn single-line if
/ for
statements
with curly braces.
Type a couple of "extra" {
}
characters -- it will
be worth it in the end!
I guess this looks mostly plausible:
int height(MaxHeap* max_heap) {
return floor(log2(max_heap->size));
}
It's not obvious to me that most callers would
expect a floor
result instead of a ceil
result.
But there's no comment documentation specifying its behavior,
and we can only report a bug w.r.t. some written spec,
so this code's behavior must be correct.
We're not restricted to power-of-two sizes,
and it's nice that that is pretty clear.
There is no warning that caller is responsible
for verifying non-empty prior to calling this,
for example we see that maxheap_init
is valid but empty.
I can't say that I feel good about returning negative
HUGE_VAL in the empty
case, since 0
seems the most natural response for that.
This submission includes a demo but no unit tests. In particular, we see no self-evaluating tests that would offer concrete example return values for particular heaps. Adding such test(s) would bring documentation value.
max_heapify
's recursive call to itself is Fine.
If this was implemented in scheme, it's a sure thing the tail recursion would be optimized, with no extra stack usage.
This submission does not include a Makefile or build script that shows which compiler is used with which compilation flags.
For some compiler target of interest, it would be useful for a comment to mention whether recursion costs us stack space here or not. A caller might know that it is tight on stack space and would want to know if it is safe to heapify. A https://godbolt.org link could suffice.
Alternatively, you might prefer to code a while
loop here.
insert
should probably report fatal error
if caller violates this constraint:
N_INF
< key
< INF
Given the void
signature,
I can't say I get warm fuzzies about how a practical
application would behave once insert
exhausts memory.
At a minimum we need some comments describing caller's
responsibility for checking errno
.
temp = 0;
What's that all about? Leftover debug? This isn't java, it's not like nulling out the pointer will enable some garbage collection.
Making extract_max
void seems odd.
Yes, caller could grab the value prior to the call.
But it seems natural to return the extracted value, no?
Or maybe this should have been a private helper
for delete_key
?
In clear
, "assume" seems like the wrong verb.
max_heap->heap[i] = N_INF; //assuming it is never in the heap
As touched on above, restricted range of heap values is a fundamental invariant of this datastructure. We "enforce" what range those values are in. Prefer a verb like "shall never be in the heap".
print_heap
does this:
... max_heap->heap[i], max_heap->heap[2*i+1], max_heap->heap[2*i+2]);
Rather than those expressions, prefer the left
/ right
helpers.
Also, is there an opportunity to use the parent
helper, here?
The is_maxheap
identifier is suggestive of Boolean,
very nice, thank you.
But couldn't we offer a return type in the signature
that is narrower than signed integer?
print_arr
does this:
printf("%d ", max_heap->heap[i]);
Given the limited range of heap values, it might be convenient to impose a fixed number of display columns for each decimal value. Then they'd have a better chance of lining up when wrapped at the terminal's line length.
main
tidies up after itself:
free(data);
free(key_index);
I am looking at the allocation in create_maxheap
.
But we never called that, preferring build_max_heap
instead.
Maybe delete some vestigial code?
main
correctly verifies allocations,
and silently drops out upon failure.
Not the friendliest behavior -- it is likely
to increase the cost of supporting this code
in the field when problem reports hit the helpdesk.
Consider linking against allocation routines that are
a bit more on the chatty side when things go south.
Overall?
This looks fairly solid. I would be willing to delegate or accept maintenance tasks for this code base. Ship it!
-
\$\begingroup\$ Thanks very much, and I even learnt some English along the way ! You mean I don t need to free in main? \$\endgroup\$V_head– V_head2023年03月05日 17:04:21 +00:00Commented Mar 5, 2023 at 17:04
-
2\$\begingroup\$ I think
free
'ing inmain
is an excellent habit, which I encourage. Maybe someone links another app against this, or renames it, or copy-n-pastes code -- the tidying up is entirely appropriate. I was just doing an inventory: Let's see, alloc/free, alloc/free, alloc, wait, what happened there? Oh, yeah! Never called, so not an issue. And then I wondered if you maybe want to delete a function that used to be called but no longer is. \$\endgroup\$J_H– J_H2023年03月05日 17:10:17 +00:00Commented Mar 5, 2023 at 17:10 -
2\$\begingroup\$ One suggestion, dumb down the answers a little bit. Quite a number of our users are English as a second language. A number are still in high school if they are native English speakers. I do understand everything you write but I speak American English and I went to a university to study computer science. Many of our users are hobbyists. Plus 1. \$\endgroup\$2023年03月05日 18:23:40 +00:00Commented Mar 5, 2023 at 18:23
-
8\$\begingroup\$ Please don't dumb down the answers :-) \$\endgroup\$Stef– Stef2023年03月06日 21:29:52 +00:00Commented Mar 6, 2023 at 21:29
General Observations
Since this might be a set of library routines, it might be a good idea to move all the heap related functions to their own .c
file and have a header file (.h
) with function prototypes and struct declarations. Functions such as swap()
, parent()
, left()
and right()
should be contained within the heap .c
and declared as static so that they are not global functions(). This will allow other data structures to use similar function names if necessary.
Contain Logic Blocks in Curly Braces
In the review by J_H it is mentioned that all then
and else
clauses of an if statement as well as all loops should be contained within curly braces ({
and }
). This is actually a best practice for C and C++. All the companies that I have worked for have had coding standards that required this.
The reason that this is a best practice is that one of the most common errors in code maintenance is the failure to create a logic block when a statement is added to an if
statement. It is better to make maintenance coding easier by putting all code into logic blocks to begin with.
Use Stronger Compiler Warnings
As J_H mentioned, we don't know what compiler you are using, but if you are using gcc
you should add some additional warning flags to improve your code. These are the flags I would recommend:
-pendantic
-Wall
Wextra
-Wconversion
These flags found the following warnings in the code:
In function 'height':
48:12: warning: conversion from 'double' to 'int' may change value [-Wfloat-conversion] return floor(log2(max_heap->size));In function 'create_maxheap':
52:28: warning: conversion to 'size_t' {aka 'long unsigned int'} from 'int' may change the sign of the result [-Wsign-conversion]52 | MaxHeap* heap = calloc(size, sizeof * heap);
In function 'insert':
100:63: warning: conversion to 'long unsigned int' from 'int' may change the sign of the result [-Wsign-conversion]
100 | int* temp = realloc (max_heap->heap, (max_heap->size + 1) * sizeof (*(max_heap->heap)));In function 'main':
166:24: warning: conversion to 'size_t' {aka 'long unsigned int'} from 'int' may change the sign of the result [-Wsign-conversion]
166 | int* data = calloc(size, sizeof * data);170:29: warning: conversion to 'size_t' {aka 'long unsigned int'} from 'int' may change the sign of the result [-Wsign-conversion]
170 | int* key_index = calloc(MAX_Key, sizeof * key_index); // we assume key > 0, if key < 0, then we add an additional array
Prefer size_t
Over int
for Size Variables
There is a special unsigned integer type used for the size of allocated memory and indexes in arrays. The C operator sizeof()
returns the type size_t. The implementation of size_t
varies by system but it always stores the maximum size of an array for a system. It is used for indexing arrays because it can't become negative, negative indexes can cause Undefined Behavior which can cause bugs.
The size_t
type is the preferred input for malloc()
, calloc()
and realloc()
.
typedef struct MaxHeap {
size_t size;
int* heap;
} MaxHeap;
Readability and Maintainability
For readability and maintainability each variable should be declared and initialized on its own line.
Line 19:
int INF = 1000, N_INF = -1000;
Should be:
int INF = 1000;
int N_INF = -1000;
Line 59:
int largest = i, leftie = left(i), rightie = right(i);
Should be:
size_t largest = i;
size_t leftie = left(i);
size_t rightie = right(i);
In main(), line 167:
int size = 6, MAX_Key = INF, elements[6] = {10, 20, 15, 12, 40, 25};
should be changed as follows, you don't need the [6] for elements and you can calculate the size:
int elements[] = {10, 20, 15, 12, 40, 25};
size_t size = (sizeof(elements)/sizeof(elements[0]));
size_t MAX_Key = INF;
-
1\$\begingroup\$ The "operator
sizeof()
" is actually justsizeof
, as I'm sure you know. \$\endgroup\$Toby Speight– Toby Speight2023年03月08日 15:22:31 +00:00Commented Mar 8, 2023 at 15:22
The biggest issue I see here, and is not yet commented on, is the call to realloc
for every insertion. When you remove an element from the heap, the size
value is changed but the memory block (array) is not.
So when you remove one element and then insert another, the realloc
call does nothing, as the array has the right size. But what happens if you remove two elements, and then insert one? realloc
will reduce the array size by 1. For the next insertion the array has to grow again. Hopefully your implementation of realloc
will not look for a new memory block here and copy data over...
In any case, increasing the array size by 1 every time you insert an element is expensive, it makes the insertion operation have a cost of O(n) instead of O(log n), because the array needs to be copied over. Typically we double the array size when we run out of space, so that the insertion operation remains of O(log n). You can accomplish this by separating the array size and the number of elements in the heap. This is slightly more bookkeeping, for the benefit of faster insertions, and no need for that marker value (the heap is empty if the "number of elements" variable is 0). For example:
typedef struct MaxHeap {
int* heap; // pointer to array
int size; // number of elements stored in array
int capacity; // size of array
} MaxHeap;
Another issue is that the heap insertion reallocates the array, changing the max_heap.heap
pointer, but not the data
pointer in main()
, which now might point to deallocated memory. At the end of the program, data
is freed (a potential exception), but not max_heap.heap
(a potential memory leak).
I think it is cleaner if MaxHeap
owns the data it points to, and at the end of the program you call some function that frees the data in the heap, say free_maxheap(max_heap)
. For best effect, you'd actually use create_maxheap()
, so that the heap allocates its own data array. You should not have a function like build_max_heap()
that takes a pointer from the caller and keeps it. Currently it is unclear to the caller: can I free the pointer after calling build_max_heap()
? No, because the pointer is still being used, though this is not documented anywhere. It is much cleaner if the caller passes a pointer to data, but the data is copied over into the heap. If for efficiency you want to have a function that takes ownership of the input pointer, then this needs to be clearly documented, both in comments and in the function name (maybe create_maxheap_with_array()
?).
I really don't like the key_index
array being a separate data structure that needs to be passed into all functions that work on the heap. It needs to be part of the heap data structure.
It seems that the key_index
array is only read in delete_key()
, everywhere else it is just updated. This is a lot of bookkeeping for the benefit of being able to delete a specific value from the heap in O(log n). I've never written a heap with this ability, I don't know if I'd do it the same way. If an O(n) delete is OK (you don't delete often), then consider just looking through the heap to find the key. If you delete often, is a heap the right data structure?
Also, key_index
being an array means that keys must be small non-negative integers. Consider using a hash table instead.
What if you want to insert()
a value that is larger than what fits in key_index
? You don't check for this in insert()
!
Avoid FP math for an integer problem
floor(log2(max_heap->size))
is overkill and at some level risks incorrect results with less than ideal FP functions. A simply loop will suffice.
unsigned log2 = 0;
unsigned v = max_heap->size;
while (v >>= 1) {
log2++;
}
return log2;
Name space
Rather than names all over the place like
...
void increase_key()
int* build_max_heap()
void insert()
...
Move your "13 operations" into a nearby namespace.
...
void mh_increase_key()
int* mh_build_max_heap()
void mh_insert()
mh_...()
...
-
\$\begingroup\$ You linked to the bit-twiddling hacks page, but you showed the naive approach used to validate the actual bit-twiddling hacks. Kind of missing the point there, eh? :-) Really, what you want is for the compiler to generate a CPU instruction that produces the value directly by counting the number of leading zero bits. If you're on GCC, use the
__builtin_clz
intrinsic. On MSVC, use_BitScanReverse
. See also: codereview.stackexchange.com/a/151331 \$\endgroup\$Cody Gray– Cody Gray2023年03月08日 21:17:54 +00:00Commented Mar 8, 2023 at 21:17 -
\$\begingroup\$ @CodyGray I prefer to use non-compiler specific code. I have found good compilers often see standard code and emit efficient code anyways, so a reduced need for me to do the compiler's job. \$\endgroup\$chux– chux2023年03月08日 21:24:21 +00:00Commented Mar 8, 2023 at 21:24
-
\$\begingroup\$ That's a good rule-of-thumb, but, as far as I know, doesn't apply in this case. Do you know otherwise? Certainly, the code you showed above is not recognized as idiomatic by any compiler that I know of and is therefore not transformed into the expected
BSR
instruction, even on a bog-standard x86 target. I did search when I wrote the above comment to see if I could find a good Stack Overflow Q&A to point to, but I couldn't find one that touched upon the correct way to write the C code to get the compiler to generate efficient code. The only solutions offered were intrinsics. \$\endgroup\$Cody Gray– Cody Gray2023年03月09日 08:56:11 +00:00Commented Mar 9, 2023 at 8:56