I decided to take up learning c++ recently, so I coded a Huffman compression algorithm. I'm particularly interested in critique of my c++ techniques (pointers, references, best practices, etc), but I am open to any comments you have.
Thanks for you time!
#include <map>
#include <iostream>
#include <vector>
#include <deque>
#include <string>
#include <algorithm>
typedef std::pair<char, int> weight_pair;
struct node {
int weight;
node *parent = NULL;
node *children[2] = {NULL, NULL};
std::string content;
};
bool compareNodeWeights(const node *node1, const node *node2) {
return (node1 -> weight) < (node2 -> weight);
}
//builds a binary tree from an alphabet and an associated set of weights
node *build_tree(const std::map<char, int> &weights){
std::deque<node*> nodes;
for (auto const &x : weights) {
node *leaf_node = new node;
leaf_node -> weight = x.second;
leaf_node -> content = std::string(1, x.first);
nodes.push_back(leaf_node);
}
while (nodes.size() > 1) {
std::sort(nodes.begin(), nodes.end(), compareNodeWeights);
node* new_node = new node;
new_node -> weight = nodes[0] -> weight + nodes[1] -> weight;
new_node -> content = nodes[0] -> content + nodes[1] -> content;
nodes[0] -> parent = new_node;
nodes[1] -> parent = new_node;
new_node -> children[0] = nodes[0];
new_node -> children[1] = nodes[1];
nodes.erase(nodes.begin());
nodes.erase(nodes.begin());
nodes.push_back(new_node);
}
return nodes[0];
}
//traverses the tree to find the prefix code for a given leaf node in a tree
std::deque<bool> find_prefix_code(const node *leaf) {
std::deque<bool> result;
const node *curr_node = leaf;
while (curr_node -> parent != NULL) {
if (curr_node -> parent -> children[0] -> content == curr_node -> content) {
result.push_front(0);
}
else {
result.push_front(1);
}
curr_node = curr_node -> parent;
}
return result;
}
std::vector<bool> compress(const std::string &message, const node *tree) {
std::vector<bool> result;
std::vector<const node*> open;
open.push_back(tree);
std::map<char, std::deque<bool>> prefix_codes;
while (open.size() > 0) {
const node* curr_node = open[0];
if (curr_node -> content.size() == 1){
prefix_codes.insert( std::pair<char, std::deque<bool>>(curr_node -> content[0], find_prefix_code(curr_node)) );
}
else {
open.push_back(curr_node -> children[0]);
open.push_back(curr_node -> children[1]);
}
open.erase(open.begin());
}
for (const char c : message) {
for (const bool &b : prefix_codes[c]) {
result.push_back(b);
}
}
return result;
}
std::string decompress(const std::vector<bool> &data, const node *tree) {
const node *curr_node = tree;
std::string result = "";
for (const bool b : data) {
int direction = b;
curr_node = curr_node -> children[direction];
if (curr_node ->content.size() == 1) {
result += curr_node -> content;
curr_node = tree;
}
}
return result;
}
void print_compressed(const std::vector<bool> &data) {
std::cout << "Compressed data: ";
for (const bool b : data) {
std::cout << b;
}
std::cout << std::endl;
}
void delete_tree(node *tree) {
for (int i = 0; i <= 1; i++) {
if (tree -> children[i] != NULL) {
delete_tree(tree -> children[i]);
}
}
delete tree;
}
int main() {
std::map<char, int> weights;
weights.insert(weight_pair(' ', 3));
weights.insert(weight_pair('a', 3));
weights.insert(weight_pair('d', 3));
weights.insert(weight_pair('b', 1));
weights.insert(weight_pair('c', 1));
node *tree = build_tree(weights);
std::vector<bool> compressed_message = compress("a cab dab bad", tree);
print_compressed(compressed_message);
std::string message = decompress(compressed_message, tree);
std::cout << "Decompressed data: " << message << std::endl;
delete_tree(tree);
return 0;
}
1 Answer 1
You have a typedef for weight_pair but only use it in main to fill the map.
node::children should be unique_ptr. That way you don't need delete_tree. However you will need at most 2*n nodes to be allocated so you can preallocate those in a std::vector<node> and avoid calling make_unique on each new node.
In build_tree you pull that map apart to build a node* array so you may as well have just passed a std::vector<weight_pair>.
You can avoid using the std::deck by reverse sorting a std::vector (so the lowest elements end up at the back ready to get popped of).
while (nodes.size() > 1) {
std::sort(nodes.begin(), nodes.end(), reverseCompareNodeWeights);
unique_ptr<node> new_node = std::make_unique<node>();
//or node* new_node = allocated_nodes[next++]; // if preallocated.
unique_ptr<node>& back1 = nodes[nodes.size()-1];
unique_ptr<node>& back2 = nodes[nodes.size()-2];
new_node -> weight = back1 -> weight + back2 -> weight;
new_node -> content = back2 -> content + back2 -> content;
back1->parent = new_node;
back2->parent = new_node;
new_node -> children[0] = std::move(back1);
new_node -> children[1] = std::move(back2);
nodes.pop_back();
nodes.back(std::move(new_node));
}
Or you could use the std::heap operations
std::make_heap(nodes.begin(), nodes.end(), compareNodeWeights);
while (nodes.size() > 1) {
std::pop_heap(nodes.begin(), nodes.end(), compareNodeWeights);
std::pop_heap(nodes.begin(), nodes.end()-1, compareNodeWeights);
//identical to above
nodes.pop_back();
nodes.back() = std::move(new_node);
std::push_heap(nodes.begin(), nodes.end(), compareNodeWeights);
}
Compressing or decompressing bit by bit using this tree is going to be very slow. It will result in a cache miss per bit of output.
instead you can make a lookup table. For compression this is straightforward it will be a std::array<compress_value> where compress_value is
struct compress_value {
uint code;
uint code_size;
}
and the compression main loop will be:
std::vector<uint8> output;
uint64 outputbuff; //filled from least significant bit first
uint filled;
for(char c : input){
compress_value value = compress_table[c];
outputbuff |= value.code << filled;
filled += value.code_size;
while(filled > 8){
output.pushback(outputbuff & 0xff);
outputbuff = outputbuff >> 8;
filled -= 8;
}
}
Decompressing will be similar. But instead you will have a lookup table that is as large as \$ 2^{\text{max code size}}\$
Each entry in the decompression table will contain at index i the character where i & mask is the code for the value.
That is
for(table_value value : table){
for(uint c = value.code; c < table_size; c += 1<<value.code_size){
decompress_table[c].ch = value.ch;
decompress_table[c].code_size = value.code_size;
}
}
The decompression main loop will be:
uint64 input_buff = read_up_to_8_bytes(input, end); //reads least significant byte first
uint filled = 64;
while(input < end){
decompress_value value = decompress_table[input_buff & decompress_mask];
output.push_back(value.ch);
input_buff = input_buff >> value.code_size;
filled -= value.code_size;
if(filled < max_code_size){
while(filled < 56 && (input != end)){
input++;
filled += 8;
}
input_buff = read_up_to_8_bytes(input, end);
if(filled != 0)
input_buff = input_buff >> (64-filled);
}
}
instead of all those explicit bounds checks you can add a new symbol that signifies the end of the bit stream and overallocate the input buffer by at least 8 bytes. Though that requires that the stream was not corrupted. The compromise is to only have the bounds check on the outer loop.
forloop. \$\endgroup\$