I'm currently writing a Huffman compressor for educational purposes.
I want it to compress/decompress files less then 5mb. The time limit is 5 seconds.
Now it compresses a 5mb file in almost 8 seconds. The decompression speed is really awful. It requires almost 6 seconds on a file of the size approx 0.1mb.
The problem is in reading/writing. I can get all code tables in less than 1 second, but then writing subroutine works about 6 seconds on 5mb file.
How can I improve speed of compression/decompression?
main.cpp
#include <iostream>
#include <bits/stdc++.h>
#include "huffman.h"
#include "archiver.h"
using namespace std::chrono;
int main() {
high_resolution_clock::time_point t1 =
high_resolution_clock::now();
Archiver ar;
//encoding
/*
std::map<char, int32_t> m;
ar.createFreqTable("test.pdf", m);
HuffmanTree t(m);
std::ifstream ifs("test.pdf");
std::ofstream ofs("test.out");
ar.compress(ifs, ofs, &t);
*/
//decoding
// /*
//test.out - 123 930 bytes
HuffmanTree nt;
std::ifstream ifs2("test.out");
std::ofstream ofs2("result.pdf");
ar.decompress(ifs2, ofs2, &nt);
//*/
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = duration_cast<milliseconds>( t2 - t1 ).count();
std::cout << duration; // 5349
return 0;
}
archiver.h
#include "huffman.h"
#include "bitstring.h"
#ifndef HUFFMAN_ARCHIVER_H
#define HUFFMAN_ARCHIVER_H
class Archiver{
private:
std::map<std::vector <bool>, char> codes;
std::map<char, std::vector<bool> > lookup;
public:
Archiver(){};
void compress(std::ifstream&, std::ofstream&, HuffmanTree*);
void decompress(std::ifstream&, std::ofstream&, HuffmanTree*);
void encodeTree(BitStringWrite&, TreeNode*);
TreeNode* decodeTree(BitStringRead&);
void buildCodes(TreeNode*, std::vector<bool>);
std::map<std::vector <bool>, char>& getCodes(){
return codes;
};
std::map<char, int32_t>& createFreqTable(const std::string&, std::map<char, int32_t>&);
void buildTable(TreeNode*);
std::map<char, std::vector<bool> >& getTable(){
return lookup;
};
};
#endif //HUFFMAN_ARCHIVER_H
archiver.cpp
#include "archiver.h"
#include <fstream>
#include <deque>
std::map<char, int32_t>& Archiver::createFreqTable(const std::string &name, std::map<char, int32_t>& freq){
std::ifstream file(name);
int next = 0;
while ((next = file.get()) != EOF) {
char uc = static_cast <char> (next);
std::map<char, int32_t>::iterator iter;
iter = freq.find(uc);
if (iter != freq.end())
iter->second += 1;
else
freq[uc] = 1;
}
return freq;
};
void Archiver::encodeTree(BitStringWrite& bw, TreeNode* node){
if (node -> isLeaf()) {
bw.writeBit(1);
bw.writeByte(node->getChar());
return;
}
else {
bw.writeBit(0);
encodeTree(bw, node->getLeftTree());
encodeTree(bw, node->getRightTree());
}
}
TreeNode* Archiver::decodeTree(BitStringRead& br){
if (br.readBit()) {
return new TreeNode(br.readByte(), 0, true, NULL, NULL);
}
else {
TreeNode* left = decodeTree(br);
TreeNode* right = decodeTree(br);
return new TreeNode(0, 0, false, left, right);
}
}
void Archiver::buildCodes(TreeNode* n, std::vector<bool> cur) {
if (n -> isLeaf()) {
codes[cur] = n->getChar();
return;
}
cur.push_back(0);
buildCodes(n->getLeftTree(), cur);
cur.pop_back();
cur.push_back(1);
buildCodes(n->getRightTree(), cur);
return;
}
void Archiver::buildTable(TreeNode* root) {
std::deque< std::pair<TreeNode *, std::vector<bool> > > q;
q.push_back(make_pair(root, std::vector<bool>()));
while (!q.empty()) {
TreeNode *node, *lc, *rc;
std::vector<bool> code;
node = q.front().first;
code = q.front().second;
q.pop_front();
lc = node->getLeftTree();
rc = node->getRightTree();
if (lc) {
std::vector<bool> code_cp(code);
q.push_back(make_pair(lc, (code.push_back(0), code)));
q.push_back(make_pair(rc, (code_cp.push_back(1), code_cp)));
}
else
lookup.insert(make_pair(node->getChar(), code));
}
}
void Archiver::compress(std::ifstream &ifs, std::ofstream &ofs, HuffmanTree *tree) {
ifs.clear();
ifs.seekg(0, ifs.beg);
BitStringRead br(ifs);
BitStringWrite bw(ofs);
buildTable(tree->getRoot());
encodeTree(bw, tree -> getRoot());
while(!ifs.eof()){
br.readByte();
int sz = getTable()[br.getByte()].size();
std::vector<bool> out = getTable()[br.getByte()];
for(int i = 0; i < sz; i++)
bw.writeBit(out[i]);
}
}
void Archiver::decompress(std::ifstream &ifs, std::ofstream &ofs, HuffmanTree *tree) {
ifs.clear();
ifs.seekg(0, ifs.beg);
BitStringRead br(ifs);
BitStringWrite bw(ofs);
TreeNode* t = decodeTree(br);
std::vector<bool> cur;
buildCodes(t, cur);
std::vector<bool> v;
bool b = false;
while(!ifs.eof()) {
while (!(getCodes().count(v)) && !ifs.eof()) {
b = br.readBit();
v.push_back(b);
}
if (ifs.eof())
break;
char s = getCodes()[v];
v.clear();
bw.writeByte(s);
}
}
huffman.h
#ifndef HUFFMAN_HUFFMAN_H
#define HUFFMAN_HUFFMAN_H
#include <sys/param.h>
#include <iostream>
#include <map>
#include <vector>
class TreeNode{
public:
TreeNode(char c, int cnt, bool l, TreeNode* lc, TreeNode* rc): character(c), count(cnt), is_leaf(l), left(lc), right(rc){};
TreeNode(): character(0), count(0), is_leaf(false), left(NULL), right(NULL){}
int getCount() const{
return this -> count;
};
char getChar() const{
return this -> character;
};
TreeNode* getLeftTree() const{
return this -> left;
};
TreeNode* getRightTree() const{
return this -> right;
};
void setLeftTree(TreeNode* n){
this -> left = n;
};
void setRightTree(TreeNode* n){
this -> right = n;
};
void setChar(char c){
this -> character = c;
};
bool isLeaf(){
return is_leaf;
}
void setLeaf(bool num){
this->is_leaf = num;
}
private:
char character;
int count;
bool is_leaf;
TreeNode* left;
TreeNode* right;
};
class HuffmanTree{
public:
HuffmanTree(std::map<char , int>&);
HuffmanTree(){
root = new TreeNode(0, 0, false, NULL, NULL);
};
~HuffmanTree();
TreeNode* getRoot() const{
return this -> root;
};
class NodeComparator {
public:
bool operator()(const TreeNode *const lhs, const TreeNode *const rhs) {
if (lhs->getCount() == rhs->getCount()) {
return lhs->getChar() > rhs->getChar();
}
return lhs->getCount() > rhs->getCount();
}
};
TreeNode* merge(TreeNode* node1, TreeNode* node2);
void recursiveNodeDelete(TreeNode* node);
// uint32_t check_count(uint32_t count);
private:
TreeNode* root;
};
#endif //HUFFMAN_HUFFMAN_H
huffman.cpp
#include "huffman.h"
#include <sstream>
#include <queue>
using namespace std;
HuffmanTree::HuffmanTree(std::map<char, int>& count_map) {
if (count_map.empty()) {
std::stringstream ss;
ss << "Compressor requires a non-empty text.";
throw std::runtime_error{ss.str()};
}
std::priority_queue<TreeNode*, std::vector<TreeNode*>, HuffmanTree::NodeComparator> queue;
for(auto a : count_map)
queue.push(new TreeNode(a.first, a.second, true, NULL, NULL));
while (queue.size() > 1) {
TreeNode* node1 = queue.top(); queue.pop();
TreeNode* node2 = queue.top(); queue.pop();
queue.push(merge(node1, node2));
}
root = queue.top(); queue.pop();
}
void HuffmanTree::recursiveNodeDelete(TreeNode* node) {
if (node == NULL) {
return;
}
recursiveNodeDelete(node->getLeftTree());
recursiveNodeDelete(node->getRightTree());
delete node;
}
HuffmanTree::~HuffmanTree() {
recursiveNodeDelete(root);
}
TreeNode* HuffmanTree::merge(TreeNode* node1, TreeNode* node2) {
TreeNode* new_node = new TreeNode(0, node1->getCount() + node2->getCount(), false, NULL, NULL);
if (node1->getCount() < node2->getCount()) {
new_node->setLeftTree(node1);
new_node->setRightTree(node2);
}
else {
new_node->setLeftTree(node2);
new_node->setRightTree(node1);
}
new_node->setChar(std::max(node1->getChar(), node2->getChar()));
return new_node;
}
bitstring.h
#ifndef HUFFMAN_BITSTRING_H
#define HUFFMAN_BITSTRING_H
#include <iostream>
#include <vector>
class BitStringWrite {
private:
char _byte;
int _pos;
std::ostream &_out_f;
public:
BitStringWrite(std::ostream &_out_f);
~BitStringWrite();
void writeBit(bool bit);
void writeByte(char b);
void flush();
};
class BitStringRead {
private:
char _byte;
int _pos;
std::istream &_in_f;
public:
BitStringRead(std::istream &_in_f);
char readByte();
bool readBit();
char getByte(){
return _byte;
}
};
#endif //HUFFMAN_BITSTRING_H
bitstring.cpp
#include "bitstring.h"
BitStringWrite::BitStringWrite(std::ostream &_out_f) : _byte(0), _pos(0), _out_f(_out_f) {}
void BitStringWrite::writeBit(bool bit) {
if (_pos == 8)
flush();
if (bit == 1) {
_byte |= (1 << _pos);
}
_pos++;
}
void BitStringWrite::writeByte(char b){
for(int i = 0; i < 8; i++)
writeBit((b >> i) & 1);
}
void BitStringWrite::flush() {
if (_pos != 0) {
_out_f.write(&_byte, sizeof(char));
_pos = 0;
_byte = 0;
}
}
BitStringRead::BitStringRead(std::istream &_in_f) : _pos(8), _in_f(_in_f) {}
bool BitStringRead::readBit() {
if (_pos == 8) {
_in_f.read(&_byte, sizeof(char));
_pos = 0;
}
return (_byte >> _pos++) & (char)1;
}
char BitStringRead::readByte() {
char sym = (char)0;
for (int i = 0; i < 8; i++){
sym |= ((1 & readBit()) << (i));
}
return sym;
}
BitStringWrite::~BitStringWrite() {
flush();
}
2 Answers 2
I don't have time to do a full write-up, but I can take an educated guess at why your file I/O is the bottleneck. You're literally reading and writing a single byte at a time:
_out_f.write(&_byte, sizeof(char));
and
_in_f.read(&_byte, sizeof(char));
You should buffer much more before writing to disk, and for reading you should read in more than you need, and pull from the buffer until it's empty. I recommend starting out with a 4096 byte buffer and profiling it to see if it's improved. You can try other sizes to see how it affects the performance.
IMO, anything that's performance related shouldn't use iostreams. Try to rewrite file io with regular C-stype fopen, fread, fclose. Try to write to your own buffer first (non ostringstream based) and then flush to file.
Also, there is a bug in BitStringWrite
: flushing shouldn't affect what you actually wrote to it:
BitStringWrite bw(ofs);
bw.writeBit(1);
bw.flush();
bw.writeBit(1);
bw.flush();
should produce the same output as:
BitStringWrite bw(ofs);
bw.writeBit(1);
bw.writeBit(1);
bw.flush();
Explore related questions
See similar questions with these tags.
auto
whenever possible. \$\endgroup\$std::bit_set
\$\endgroup\$std::bitset
has a fixed template size and my codes can be any length, so it is not suitable here. The thing is the reading/writing phase is really slow. Also I changedmap
forunordered_map
now it works faster, but it's still not enough. \$\endgroup\$