I wrote a general purpose file compression program out of curiosity, but I am concerned about readability. I don't know if the variable names are useful, and whether most of the comments are unneeded or not, so any advice/criticism is welcome.
You can check the repo, for more information.
const.h
#ifndef CONST_GUARD
#define CONST_GUARD
/* The total number of possible symbols,
meaning all possible 256 byte values and the
special EOF symbol, defined to make decompression easier. */
#define SYM_NUM 257
// Size of an integer in bits.
#define INT_SIZE sizeof(int)*8
#endif
codes.h
#ifndef CODE_GUARD
#define CODE_GUARD
#include "const.h"
#define MAX_CODELEN 12 // Max length of prefix codes
#define DECOMP_SIZE (1 << MAX_CODELEN) // Size of lookup array (2^max)
// prefixNodeT: Huffman tree node
struct huffmanNode {
// freq: The frequency of the symbol corresponding to the node
size_t freq;
struct huffmanNode *left;
struct huffmanNode *right;
/* symbol: For a leaf node, the symbol correspoding to the node,
for an inner node it isn't used. */
unsigned int symbol;
};
typedef struct huffmanNode huffmanNodeT;
/* compTableT: Lookup table used in compression,
contains the prefix code and value for a given symbol.
So, compression using the table is done by getting the
(val, len) code pair for every byte of the original file. */
typedef struct {
// Vals, lens are indexed by symbol numeric value
unsigned int vals[SYM_NUM];
int lens[SYM_NUM]; //in bits
} compTableT;
/* decompTableT: Lookup table used in decompression, as described in
https://commandlinefanatic.com/cgi-bin/showarticle.cgi?article=art007 */
typedef struct {
char codeLens[DECOMP_SIZE]; //in bits
int symbols[DECOMP_SIZE];
} decompTableT;
/* initCompressionTable(): Initialize the lookup table used in compression.
Input:
> compTablePtr: Pointer to the compression table
> freqs: Array containing the appearance frequency of all the symbols
Output:
>
>
Assumptions:
> compTablePtr != NULL
*/
int initCompressionTable(compTableT *compTablePtr, size_t freqs[SYM_NUM]);
/* initDecompressionTable(): Initialize the lookup table used in decompression.
Assumptions:
> decompTablePtr != NULL
*/
int initDecompressionTable(decompTableT *decompTablePtr, int codeLens[SYM_NUM]);
#endif
codes.c
#include <stdlib.h>
#include "const.h"
#include "minQueue.h"
#include "error.h"
#include "codes.h"
/* symbolT: Contains a symbol and the prefix code corresponding to it.
Needed to maintain the symbol -> (len,val) mapping after sorting. */
typedef struct {
unsigned int symbol;
unsigned int codeVal;
int codeLen;
} symbolT;
// initLeafNode(): Initalizes a leaf node of the Huffman tree.
huffmanNodeT *initLeafNode(unsigned int symbol, size_t freq) {
huffmanNodeT *leaf = malloc(sizeof(huffmanNodeT));
if (!leaf) {
reportError("malloc", __FILE__, __LINE__);
return NULL;
}
leaf->symbol = symbol;
leaf->left = leaf->right = NULL;
leaf->freq = freq;
return leaf;
}
/* initInnerNode: Initializes an inner node of the Huffman tree
(corresponds to a composite symbol). */
huffmanNodeT *initInnerNode(huffmanNodeT *left, huffmanNodeT *right) {
huffmanNodeT *node = malloc(sizeof(huffmanNodeT));
if (!node) {
reportError("malloc", __FILE__, __LINE__);
return NULL;
}
node->left = left;
node->right = right;
node->freq = left->freq + right->freq;
return node;
}
/* freeHuffmanTree(): Frees the Huffman tree.
Assumes that root != NULL. */
void freeHuffmanTree(huffmanNodeT *root) {
if (root->left != NULL)
freeHuffmanTree(root->left);
if (root->right != NULL)
freeHuffmanTree(root->right);
free(root);
}
/* initHuffmanTree(): Uses the textbook Huffman coding algorithm to initialize
the Huffman tree, using the frequencies of all the symbols. */
huffmanNodeT *initHuffmanTree(size_t freqs[SYM_NUM]) {
minQueueT minQueue;
huffmanNodeT **initialNodes;
huffmanNodeT *node1, *node2, *newNode, *huffmanRoot;
int nonZeroTotal = 0;
int k, t;
/* Initialize the array of the initial nodes, excluding the symbols with freq = 0
so that they will not waste heap operations. */
for (k = 0 ; k < SYM_NUM ; k++)
if (freqs[k] != 0)
nonZeroTotal++;
initialNodes = malloc(sizeof(huffmanNodeT*)*nonZeroTotal);
if (!initialNodes) {
reportError("malloc", __FILE__, __LINE__);
return NULL;
}
for (t = 0, k = 0 ; k < SYM_NUM ; k++) {
if (freqs[k]) {
initialNodes[t] = initLeafNode(k, freqs[k]);
if (!initialNodes[t]) {
reportError("malloc", __FILE__, __LINE__);
return NULL;
}
t++;
}
}
/* Use the array to initialize the min queue used in the algorithm,
no more than nonZeroTotal positions are needed in the min queue,
as in each step 2 symbols merge into 1, so #symbols =< nonZeroTotal. */
minQueueInit(&minQueue, initialNodes, nonZeroTotal);
while (minQueue.nodeTotal > 1) {
node1 = delMin(&minQueue);
node2 = delMin(&minQueue);
newNode = initInnerNode(node1, node2);
if (!newNode) {
reportError("malloc", __FILE__, __LINE__);
return NULL;
}
minQueueIns(&minQueue, newNode);
}
/* Remove the root from the min queue, before freeing the queue's array,
as the array is used in the minQueue, not a copy. */
huffmanRoot = delMin(&minQueue);
free(initialNodes);
return huffmanRoot;
}
/* compHuffmanLens(): Computes the Huffman code length correspoding to each
symbol by recursively traversing the symbol tree.
Assumptions:
> codeLens = 0 when the function is called outside itself.
> symbols[] is sorted in such a way that symbols[k].symbol == k, where k is a symbol. */
void compHuffmanLens(huffmanNodeT *huffmanNode, symbolT symbols[SYM_NUM], int codeLen) {
if (huffmanNode->left == NULL && huffmanNode->right == NULL) {
symbols[huffmanNode->symbol].codeLen = codeLen;
return;
}
if (huffmanNode->left != NULL)
compHuffmanLens(huffmanNode->left, symbols, codeLen+1);
if (huffmanNode->right != NULL)
compHuffmanLens(huffmanNode->right, symbols, codeLen+1);
}
// lenComp(): For use in qsort(), compares 2 symbols by code length.
int lenComp(const void *ptr1, const void *ptr2) {
symbolT sym1 = *((symbolT*)ptr1);
symbolT sym2 = *((symbolT*)ptr2);
return sym1.codeLen - sym2.codeLen;
}
/* lenThenLexComp(): For use in qsort(), first compares 2 symbols by code length, then
lexicographically (by their numeric values). */
int lenThenLexComp(const void *ptr1, const void *ptr2) {
symbolT sym1 = *((symbolT*)ptr1);
symbolT sym2 = *((symbolT*)ptr2);
int diff = sym1.codeLen - sym2.codeLen;
if (diff != 0)
return diff;
return sym1.symbol - sym2.symbol;
}
/* limitCodeLens(): Limits code lengths to MAX_CODELEN, using the heuristic algorithm
detailed here http://cbloomrants.blogspot.com/2010/07/07-03-10-length-limitted-huffman-codes.html,
in order for the decompression lookup table scheme to be used.
Assumptions:
> symbols[] is sorted by increasing codeLen.
*/
void limitCodeLens(symbolT symbols[SYM_NUM]) {
// kraftSum: The sum of 1 / 2^codeLen that appears in Kraft's inequality.
double kraftSum = 0;
int k;
/* 1. In the following loops, symbols with codeLen = 0 are ignored,
because they do not appear in the file to be compressed.
2. Casting to double is used to compute the Kraft inequality sum without
using pow() from math.h */
for (k = 0 ; k < SYM_NUM ; k++) {
if (symbols[k].codeLen) {
if (symbols[k].codeLen > MAX_CODELEN)
symbols[k].codeLen = MAX_CODELEN;
kraftSum += 1 / (double)(1 << symbols[k].codeLen);
}
}
for (k = SYM_NUM-1 ; k >= 0 ; k--)
while (symbols[k].codeLen && symbols[k].codeLen < MAX_CODELEN && kraftSum > 1 ) {
symbols[k].codeLen++;
kraftSum -= 1 / (double)(1 << symbols[k].codeLen);
}
for (k = 0 ; k < SYM_NUM ; k++)
while (symbols[k].codeLen && (kraftSum + 1 / (double)(1 << symbols[k].codeLen)) <= 1) {
kraftSum += 1 / (double)(1 << symbols[k].codeLen);
symbols[k].codeLen--;
}
}
/* computeCodeVals(): Compute prefix code values using the Canonical Huffman code
generation algorithm.
Assumptions:
> symbols[] is sorted by increasing codeLen and then lexicographically (this is needed in order
to enforce the same ordering of symbols with equal lengths during compression and decompression, so
that the same code values are produced).
*/
void computeCodeVals(symbolT symbols[SYM_NUM]) {
int prevLen, prevVal = 0;
int k;
for (k = 0 ; k < SYM_NUM ; k++)
if (symbols[k].codeLen) {
symbols[k].codeVal = 0;
prevLen = symbols[k].codeLen;
break;
}
for (k++ ; k < SYM_NUM ; k++)
if (symbols[k].codeLen) {
prevVal = symbols[k].codeVal = (prevVal + 1) << (symbols[k].codeLen - prevLen);
prevLen = symbols[k].codeLen;
}
}
/* initCompressionTable(): Initialize the lookup table used in compression.
Assumptions:
> compTablePtr != NULL
*/
int initCompressionTable(compTableT *compTablePtr, size_t freqs[SYM_NUM]) {
huffmanNodeT *huffmanRoot;
symbolT symbols[SYM_NUM];
int k;
huffmanRoot = initHuffmanTree(freqs);
if (!huffmanRoot)
return -1;
// Iterate over all of the symbols and init symbols[]
for (k = 0 ; k < SYM_NUM ; k++) {
symbols[k].symbol = k;
/* Not computed yet, codeLen init to 0 in order to indicate
a symbol that does not appear in the file to be compressed. */
symbols[k].codeLen = 0;
symbols[k].codeVal = 0;
}
compHuffmanLens(huffmanRoot, symbols, 0);
// The tree is not needed anymore
freeHuffmanTree(huffmanRoot);
// Sort symbol array by increasing code length, needed by limitCodeLens()
qsort(symbols, SYM_NUM, sizeof(symbolT), lenComp);
limitCodeLens(symbols);
qsort(symbols, SYM_NUM, sizeof(symbolT), lenThenLexComp);
computeCodeVals(symbols);
// Fill the compression table
for (k = 0 ; k < SYM_NUM ; k++) {
/* symbols[k].symbol is used as the index
and not k, because due to the above sorting
symbols[k].symbols != k in the general case. */
compTablePtr->lens[symbols[k].symbol] = symbols[k].codeLen;
compTablePtr->vals[symbols[k].symbol] = symbols[k].codeVal;
}
return 0;
}
/* initDecompressionTable(): Initialize the lookup table used in decompression.
Assumptions:
> decompTablePtr != NULL
*/
int initDecompressionTable(decompTableT *decompTablePtr, int codeLens[SYM_NUM]) {
symbolT symbols[SYM_NUM];
int k, j;
/*
leftVal: The codeVal of the currently checked symbol left shifted,
so that the codeLen least significant bits become the most
significant.
leftIdx: The current value of index left shifted,
so that the MAX_CODELEN least significant bits become the most
significant.
mask: AND mask of the form 111110....000, used to determine if
leftVal is a prefix of leftIdx.
*/
int leftVal, leftIdx, mask;
// curLen: Length of the current symbol checked
int curLen;
for (k = 0 ; k < SYM_NUM ; k++) {
symbols[k].symbol = k;
symbols[k].codeLen = codeLens[k];
symbols[k].codeVal = 0; // Not known yet
}
/* Sort symbol array by length and then lexicographically to produce the same codes values
that were used in compression. */
qsort(symbols, SYM_NUM, sizeof(symbolT), lenThenLexComp);
// Compute code vals using the same algo as in compression
computeCodeVals(symbols);
/* The table is filled as follows:
For every MAX_CODELEN-bit integer k, find the
prefix code C that is a prefix of k (in a prefix code, a code cannot be
prefixed by another, so only one code can be a prefix of k).
To find the prefix, check all of the codes (len, val pairs) in the inner loop,
The index k is left shifted so that the MAX_CODELEN LSBs become the MSBs,
and the value of the code being checked is left shifted so that the codeLen LSBs
become the MSBs. If k is AND masked by a mask of the form 111111111000..0,
\codeLen/
and masked(k) == the shifted value of the current code, C is a prefix of k.
Upon finding C, codeLens[k] = the length of C (in bits),
symbols[k] = the symbol encoded by C.
*/
for (k = 0 ; k < DECOMP_SIZE ; k++)
for (j = 0 ; j < SYM_NUM ; j++) {
curLen = symbols[j].codeLen;
if (curLen != 0) { // Ignore the symbols that do not appear in the original file
leftIdx = k << (INT_SIZE - MAX_CODELEN);
mask = ((1 << curLen) - 1) << (INT_SIZE - curLen);
leftVal = symbols[j].codeVal << (INT_SIZE - curLen);
if ((leftIdx & mask) == leftVal) {
decompTablePtr->codeLens[k] = curLen;
decompTablePtr->symbols[k] = symbols[j].symbol;
break;
}
}
}
return 0;
}
minQueue.h
#ifndef QUEUE_GUARD
#define QUEUE_GUARD
#include "codes.h"
// Min queue implementation used in Huffman coding (encoding.c).
// minQueueT: Binary min-heap implemented using arrays
typedef struct {
/* Array of pointers to Huffman tree nodes. */
huffmanNodeT **array;
/* Number of positions in the array that are used (due to how
a binary heap works they are the adjacent positions [0, nodeTotal-1] */
int nodeTotal;
} minQueueT;
/* minQueueInit(): Initialize the minimum queue, using an array of Huffman tree nodes.
Assumptions:
> minQueuePtr is not a NULL pointer.
> nodeTotal > 0
*/
void minQueueInit(minQueueT *minQueuePtr, huffmanNodeT **initialNodes, int nodeTotal);
/* minQueueIns(): Insert a Huffman tree root into the queue.
Assumptions:
> minQueuePtr is not a NULL pointer.
*/
void minQueueIns(minQueueT *minQueuePtr, huffmanNodeT *node);
/* delMin(): Remove the root of minimum frequency from the queue and return it.
Assumptions:
> minQueuePtr is not a NULL pointer.
*/
huffmanNodeT *delMin(minQueueT *minQueuePtr);
#endif
minQueue.c
#include <stdlib.h>
#include "codes.h"
#include "minQueue.h"
void swap(huffmanNodeT **array, int x, int y) {
huffmanNodeT *temp = array[x];
array[x] = array[y];
array[y] = temp;
}
/* siftDown(): Sinks a node down the heap until the min heap
property is satisfied, by swapping it with the child of
minimum frequency.
Assumptions:
> minQueuePtr != NULL
> 0 <= pos < nodeTotal
*/
void siftDown(minQueueT *minQueuePtr, int pos) {
int minChild = 2*pos + 1;
int nodeTotal = minQueuePtr->nodeTotal;
huffmanNodeT **array = minQueuePtr->array;
while (minChild < nodeTotal) {
if (minChild + 1 < nodeTotal && array[minChild+1]->freq < array[minChild]->freq)
minChild++;
if (array[pos]->freq > array[minChild]->freq) {
swap(array, pos, minChild);
pos = minChild;
minChild = 2*pos+1;
}
else
break;
}
}
void minQueueInit(minQueueT *minQueuePtr, huffmanNodeT **initialNodes, int nodeTotal) {
minQueuePtr->array = initialNodes;
minQueuePtr->nodeTotal = nodeTotal;
for (int pos = nodeTotal/2 - 1 ; pos >= 0 ; pos--)
siftDown(minQueuePtr, pos);
}
void minQueueIns(minQueueT *minQueuePtr, huffmanNodeT *node) {
int pos = minQueuePtr->nodeTotal++;
huffmanNodeT **array = minQueuePtr->array;
array[pos] = node;
while (pos > 0 && array[(pos-1)/2]->freq > node->freq) {
swap(array, pos, (pos-1)/2);
pos = (pos-1)/2;
}
}
huffmanNodeT *delMin(minQueueT *minQueuePtr) {
huffmanNodeT *min = minQueuePtr->array[0];
minQueuePtr->array[0] = minQueuePtr->array[--minQueuePtr->nodeTotal];
siftDown(minQueuePtr, 0);
return min;
}
file.h
#ifndef FILE_GUARD
#define FILE_GUARD
#include <stdio.h>
#include "const.h"
#include "codes.h"
/* The file format used for the compressed files is the
following:
> A header, which includes:
1. The magic number "FG2019" in ASCII.
2. The number of compressed data bytes (sizeof(size_t))
3. The code lengths of all 256 possible byte values and of the EOF symbol,
with values that do not appear in the original file being indicated
with a length of 0, which are needed to generate the same codes
as in compression, so that the data will be able to be decoded.
> The compressed data bytes (their number is stored in the header),
which always include the EOF symbol (of numeric value EOF_VAL) as
the last encoded symbol to aid in decompression.
Some bits of the last byte might be "padding" bits, used to fill
the last byte when the compression data size is not divisible by 8
(so the last bits do not fill a whole byte). Once the EOF symbol is
decoded, no more bits are to be processed, so those "padding" bits are igneored.
*/
/* isEmpty(): Checks if the file is empty. */
int isEmpty(FILE *fptr);
/* countSyms(): Return the frequencies of all symbols (byte or EOF)
in the file pointed to by src.
Assumptions:
> src != NULL
*/
int countSyms(FILE *src, size_t freqs[SYM_NUM]);
/* compress(): Compresses the file pointed to by src,
and writes the compressed data to dest.
Assumptions:
> All arguements != NULL
*/
int compress(FILE *src, FILE *dest, compTableT *compTablePtr);
/* compress(): Decompresses the compressed file pointed to by src
and writes the decompressed data to dest.
Assumptions:
> All arguements != NULL
*/
int decompress(FILE *src, FILE *dest, decompTableT decompTable, size_t compSize);
/* writeHeader(): Writes information necessary for decompression
(the compression header) to the file pointed to by dest. */
int writeHeader(FILE *dest, compTableT *compTablePtr, size_t freqs[SYM_NUM]);
/* readHeader(): Reads the compression header, and stores the
necessary information (code lengths and compressed data size). */
int readHeader(FILE *src, int codeLens[SYM_NUM], size_t *compSizePtr);
#endif
file.c
#include <stdio.h>
#include <string.h>
#include "const.h"
#include "codes.h"
#include "error.h"
#include "file.h"
// Magic number and length
#define MAGIC_NUM "FG2019"
#define MAGIC_LEN 6
/* AND mask used in decoding, with the
x least significant bits being 1. */
#define MASK(x) (x >= INT_SIZE ? 0xFFFFFFFF : (1 << x) - 1)
// Size (in bytes) of the various buffers used
#define BUF_SIZE 1024
/* Shift amount used to make the MAX_CODELEN LSBs of the integer
the MAX_CODELEN MSBs by left shifting, used in decoding. */
#define LOOKUP_SHIFT (INT_SIZE - MAX_CODELEN)
/* The numerical value of the EOF symbol, used to simplify the
decoding process, as once an EOF symbol is decoded, no more
bits of the compressed file contain data of the original and
decoding can stop. */
#define EOF_VAL 256
int isEmpty(FILE *fptr) {
char c;
/* If the end-of-file is reached upon trying to read one character,
then the file is empty. */
c = fgetc(fptr);
if (c == EOF) {
if (feof(fptr))
fprintf(stderr, "The file is empty!\n");
else
reportError("fgetc", __FILE__, __LINE__);
return 1;
}
// Put the character back into the stream.
c = ungetc(c, fptr);
if (c == EOF) {
reportError("ungetc", __FILE__, __LINE__);
return 1;
}
return 0;
}
int countSyms(FILE *src, size_t freqs[SYM_NUM]) {
unsigned char buffer[BUF_SIZE];
size_t bytesRead;
int k;
bzero(freqs, SYM_NUM*sizeof(size_t));
/* Read the data in chunks of BUF_SIZE bytes until the end-of-file, to perform less read operations,
and count the appearances of each symbol. */
do {
bytesRead = fread(buffer, 1, BUF_SIZE, src);
if (ferror(src)) {
reportError("safeRead", __FILE__, __LINE__);
return -1;
}
for (k = 0 ; k < bytesRead ; k++)
freqs[buffer[k]] += 1;
} while (!feof(src));
/* Add one appearance for the special EOF symbol (not to be confused with the actual end-of-file)
that is used in decoding. */
freqs[EOF_VAL] = 1;
return 0;
}
int compress(FILE *src, FILE *dest, compTableT *compTablePtr) {
/*
readBuf[]: Buffer used to read BUF_SIZE from src at once, an optimization
to limit fread() function calls (1 call for BUF_SIZE bytes is cheaper
than BUF_SIZE calls to read 1 byte).
writeBuf[]: Used for similar reasons to readBuf[], only for
fwrite() function calls. Must be initialized to 0,
so that OR operations involving it give the wanted results.
*/
unsigned char readBuf[BUF_SIZE], writeBuf[BUF_SIZE] = {0};
/* bitsRemCode: The bits of the code corresponding to the current
byte of the read buffer being encoded, that need to be written to the
write buffer. */
int bitsRemCode;
/* bitsRemByte: The bits remaining in the byte of the write buffer
that is currently used. Initialized to 8, as the first byte
will initially have all of its 8bits unused. */
char bitsRemByte = 8;
// wPos: The position of the current byte in writeBuf.
int wPos = 0;
int k;
size_t bytesRead, bytesWritten;
int *codeLens = compTablePtr->lens;
unsigned int *codeVals = compTablePtr->vals;
// curVal: The current code value being written to the write buffer.
unsigned int curVal;
do {
bytesRead = fread(readBuf, 1, BUF_SIZE, src);
if (ferror(src)) {
reportError("safeRead", __FILE__, __LINE__);
return -1;
}
for (k = 0 ; k < bytesRead ; k++) {
bitsRemCode = codeLens[readBuf[k]];
curVal = codeVals[readBuf[k]];
/* While the code bits that haven't been written yet do
not fit in the current byte of the write buffer,
write as many bits as possible to fit in the byte, then
move on to the next byte of the buffer, or in the
case that the buffer is full, flush it and then begin from wPos = 0. */
while (bitsRemCode > bitsRemByte) {
writeBuf[wPos] |= curVal >> (bitsRemCode - bitsRemByte);
wPos++;
if (wPos == BUF_SIZE) {
bytesWritten = fwrite(writeBuf, 1, BUF_SIZE, dest);
if (bytesWritten < BUF_SIZE) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
// Clear the write buffer, else OR operations will produce garbage
bzero(writeBuf, BUF_SIZE);
wPos = 0;
}
/* bitsRemByte bits of the code were written to the buffer,
so bitsRemCode - bitsRemByte remain */
bitsRemCode -= bitsRemByte;
// A new byte will be used, so all of its 8 bits are free
bitsRemByte = 8;
}
// Write the reamining bits of the code to the current byte
writeBuf[wPos] |= curVal << (bitsRemByte - bitsRemCode);
bitsRemByte -= bitsRemCode;
}
} while (!feof(src));
// Another loop to write the encoded EOF symbol to the file.
curVal = codeVals[EOF_VAL];
bitsRemCode = codeLens[EOF_VAL];
while (bitsRemCode > bitsRemByte) {
writeBuf[wPos] |= curVal >> (bitsRemCode - bitsRemByte);
wPos++;
if (wPos == BUF_SIZE) {
bytesWritten = fwrite(writeBuf, 1, BUF_SIZE, dest);
if (bytesWritten < BUF_SIZE) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
bzero(writeBuf, BUF_SIZE);
wPos = 0;
}
bitsRemCode -= bitsRemByte;
bitsRemByte = 8;
}
writeBuf[wPos] |= curVal << (bitsRemByte - bitsRemCode);
bitsRemByte -= bitsRemCode;
/* If wPos > 0 after the above while loop ends,
some compressed data is still in the
write buffer. In that case a last write to the
file should be performed. */
bytesWritten = fwrite(writeBuf, 1, wPos+1, dest);
if (bytesWritten < wPos+1) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
return 0;
}
int writeHeader(FILE *dest, compTableT *compTablePtr, size_t freqs[SYM_NUM]) {
// compSize: Size of the compressed data in bytes.
size_t compSize = 0;
size_t written;
// lenBuf: Buffer that stores the codeLens to be written.
unsigned char lenBuf[SYM_NUM];
written = fwrite(MAGIC_NUM, 1, MAGIC_LEN, dest);
if (written < MAGIC_LEN) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
// Compute compSize in bits.
for (int k = 0 ; k < SYM_NUM ; k++)
compSize += freqs[k] * compTablePtr->lens[k];
// Convert compSize to byte size by ceil.
compSize = (compSize % 8 == 0) ? (compSize / 8) : (compSize / 8 + 1);
/* Write size to file (for error written != 0 is checked,
due to writing only 1 item of size sizeof(size_t)). */
written = fwrite(&compSize, sizeof(size_t), 1, dest);
if (written == 0) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
/* Write the code lengths for every symbol (EOF included) to
the file (even the 0 lengths for the symbols that do not appear).
There is no need to provide further information, the codeVals
can be computed using the codeLens using the canonical Huffman algorithm
in code.c. */
for (int k = 0 ; k < SYM_NUM ; k++)
lenBuf[k] = compTablePtr->lens[k];
written = fwrite(lenBuf, 1, SYM_NUM, dest);
if (written < SYM_NUM) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
return 0;
}
int readHeader(FILE *src, int codeLens[SYM_NUM], size_t *compSizePtr) {
// compSize: Size of the compressed data in bytes.
size_t compSize;
/* readBuf: Buffer used for reading from the file, both the magic number,
and the codelengths of the SYM_NUM symbols fit into it. */
char readBuf[SYM_NUM] = {0};
// Read MAGIC_LEN bytes, if there are not enough, the file does not adhere to the format.
fread(readBuf, 1, MAGIC_LEN, src);
if (feof(src)) {
fprintf(stderr, "%s:%d: Malformed magic num error.\n", __FILE__, __LINE__);
return -1;
}
else if (ferror(src)) {
reportError("safeRead", __FILE__, __LINE__);
return -1;
}
if (strcmp(readBuf, MAGIC_NUM) != 0) {
fprintf(stderr, "Magic number missing!\n");
return -1;
}
// Read compSize from file.
fread(&compSize, sizeof(size_t), 1, src);
if (feof(src)) {// Again if not enough bits, the header is malformed
fprintf(stderr, "%s:%d: Malformed header error.\n", __FILE__, __LINE__);
return -1;
}
else if (ferror(src)) {
reportError("safeRead", __FILE__, __LINE__);
return -1;
}
*compSizePtr = compSize;
// Read all of the code lengths.
fread(readBuf, 1, SYM_NUM, src);
if (feof(src)) {
fprintf(stderr, "%s:%d: Malformed header error.\n", __FILE__, __LINE__);
return -1;
}
else if (ferror(src)) {
reportError("safeRead", __FILE__, __LINE__);
return -1;
}
for (int k = 0 ; k < SYM_NUM ; k++)
codeLens[k] = readBuf[k];
return 0;
}
int decompress(FILE *src, FILE *dest, decompTableT decompTable, size_t compSize) {
// readBuf, writeBuf: Used for the same reason as in compress()
unsigned char readBuf[BUF_SIZE], writeBuf[BUF_SIZE];
// Bits remaining in the current read buffer byte
char bitsRem = 8;
// wPos: write buffer position, rPos: read buffer positions
int wPos = 0, rPos;
size_t bytesRead, bytesWritten;
int readLoops = (compSize % BUF_SIZE == 0) ? (compSize / BUF_SIZE) : (compSize / BUF_SIZE + 1);
/* decIdx: A 32bit buffer, the MAX_CODELEN most significant bits of
which are used as the index of the decoding lookup table, by doing decIdx >> LOOKUP_SHIFT
Initialized to 0, in order for the first bit write using OR operations to work correctly.
*/
unsigned int decIdx = 0;
// Bits needed until all 32bits of decIdx are filled.
int bitsNeeded = INT_SIZE;
/* totalBytes: The total number of bytes that have been read, used to detect
if the compressed data is less than that indicated by compSize. */
size_t totalBytes = 0;
for (int k = 0 ; k < readLoops ; k++) {
// (Re)fill read buffer
bytesRead = fread(readBuf, 1, BUF_SIZE, src);
if (ferror(src)) {
reportError("safeRead", __FILE__, __LINE__);
return -1;
}
/* If the end-of-file (not to be confused with the EOF symbol) is reached,
and the total number of bytes read is less than compSize, then some
bytes are missing. Else, it is just the last loop of filling the read buffer. */
else if (feof(src) && (totalBytes + bytesRead < compSize)) {
fprintf(stderr, "%s:%d: Malformed file error, less data than promised.\n", __FILE__, __LINE__);
return -1;
}
totalBytes += bytesRead;
// Reset read buffer position
rPos = 0;
/* The decoding table decIdx is formed using the bits read from the file,
and should always contain sizeof(int)*8 bits. The number of bits
is arbitrary, as long as it is greater than MAX_CODELEN (so that
there won't be a need to find more bits during decoding). The requirement
for the number of bits to remain constant is to simplify the
algorithm a bit, with decIdx mimicking a bit stream.
Initially, decIdx is empty (bitsNeeded == INT_SIZE),
so it is filled with bits from the read buffer.
When the x most significant bits of decIdx are used to
decode a symbol (which is written to the write buffer), they are
consumed (removed from decIdx through left shifting by x).
Those x bits are then replenished by bits of the read buffer (as
mention, decIdx always contains sizeof(int)*8 bits), and
if the read buffer is out of bits, the while(1){} loop is
exited, and the read buffer is refilled.
*/
while (1) {
/* The approach to writing bits to decIdx is similar to that used
in compress() for writing bits to writeBuf[wPos], but a mask is
needed because decIdx is not 8bits long (unlike writeBuf[wPos]),
so if bits other than the bitsNeeded LSBs are != 0, the value of
decIdx will be wrong. */
while (bitsNeeded > bitsRem && rPos < bytesRead) {
// Left shift promotes unsigned char to int
decIdx |= ((readBuf[rPos] << (bitsNeeded - bitsRem)) & MASK(bitsNeeded));
rPos++;
bitsNeeded -= bitsRem;
bitsRem = 8;
}
if (rPos == bytesRead)
break;
decIdx |= ((readBuf[rPos] >> (bitsRem - bitsNeeded)) & MASK(bitsNeeded));
bitsRem -= bitsNeeded;
writeBuf[wPos] = decompTable.symbols[decIdx >> LOOKUP_SHIFT];
wPos++;
// If the write buffer is full, flush and reset position
if (wPos == BUF_SIZE){
bytesWritten = fwrite(writeBuf, 1, BUF_SIZE, dest);
if (bytesWritten < BUF_SIZE) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
wPos = 0;
}
bitsNeeded = decompTable.codeLens[decIdx >> LOOKUP_SHIFT];
decIdx <<= bitsNeeded;
}
}
/* After all of the compressed file's bits have been read,
bits stored in decIdx cannot be replenished, so the bits
remaining in decIdx are decoded into the symbols of t
he original file, until the EOF symbol is found.
Assuming the compressed file is not corrupted, the EOF
symbol should always be found. */
// Check if EOF was found using its numeric value.
while (decompTable.symbols[decIdx >> LOOKUP_SHIFT] != EOF_VAL) {
writeBuf[wPos] = decompTable.symbols[decIdx >> LOOKUP_SHIFT];
wPos++;
if (wPos == BUF_SIZE){
bytesWritten = fwrite(writeBuf, 1, BUF_SIZE, dest);
if (bytesWritten < BUF_SIZE) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
wPos = 0;
}
// Consume the bits of decIdx needed for decoding.
bitsNeeded = decompTable.codeLens[decIdx >> LOOKUP_SHIFT];
decIdx <<= bitsNeeded;
}
/* If wPos > 0 after the above while loop ends,
some data is still in the
write buffer. In that case a last write to the
file should be performed. */
if (wPos > 0) {
bytesWritten = fwrite(writeBuf, 1, wPos, dest);
if (bytesWritten < wPos) {
reportError("fwrite", __FILE__, __LINE__);
return -1;
}
}
return 0;
}
error.h
#ifndef ERROR_GUARD
#define ERROR_GUARD
void reportError(char *message, char *filename, int line);
#endif
error.c
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
void reportError(char *message, char *filename, int line) {
fprintf(stderr, "%s:%d: ", filename, line);
perror(message);
}
```
3 Answers 3
Write inline documentation in Doxygen format
You are making a good effort to document the functions, variables and macros you declare. It would be even better if you use a standardized format for it, like Doxygen. This can then be processed by tools to provide that documentation in HTML and PDF format, and those tools can even check if you describe all input and output parameters of functions, and whether you forgot to document anything.
In addition to documenting assumptions, assert() them
If a function requires that a parameter that's passed in is non-NULL, add an assert() statement at the start of that function to check that. You can disable the
assert()-statements when making a release build by adding -DNDEBUG to the compiler flags. But during development, this helps debugging issues.
Avoid macros where possible
Don't #define anything you could just as well have written as a const variable or as a function. The problem with macros is that it is easy to forget necessary parentheses, or to evaluate macro arguments with side-effects multiple times. Constants and inlined functions are just as performant as macros. Also, you get stronger typing. So for example, instead of:
#define MASK(x) (x >= INT_SIZE ? 0xFFFFFFFF : (1 << x) - 1)
Write:
static inline int mask(int x) {
if (x >= INT_SIZE)
return 0xFFFFFFFF;
else
return (1 << x) - 1;
}
Note that the above already hints at an issue: the constant 0xFFFFFFFF is larger than the maximum value of an int. Maybe this should return an unsigned int? And while you are at it, you should probably just assert(x < INT_SIZE).
Make functions static where possible
If you have a function that is only used in the file it is defined in, you should make it static. This is especially important for libraries, as it prevents these function names leaking into the global namespace. Also, it allows the compiler to make more aggressive optimizations.
Use a code formatter
There are some inconsistencies in your code with spaces around braces and math operators, there's mixed /* ... */ and // ... comment styles, newlines around code blocks, and so on. Use a code formatter like indent, astyle or clang-format to ensure everything has a consistent style. This also makes it easier if you have other people contributing to your code.
Remove unnecessary parentheses and casts
This is rather personal, but I believe that adding unnecessary parentheses to expressions more often makes them less readable than that it improves readability. For example:
symbolT sym1 = *((symbolT*)ptr1);
This can be written as:
symbolT sym1 = *(symbolT*)ptr1;
Also, you can avoid explicit casts to double in some cases. For example:
kraftSum += 1 / (double)(1 << symbols[k].codeLen);
Can be rewritten as:
kraftSum += 1.0 / (1 << symbols[k].codeLen);
The 1.0 makes it explicit to the compiler that you want to do a floating point division.
Don't indent inside header guards
Don't indent code inside #ifndef FOO_GUARD ... #endif. This would indent all but two lines in a file. This doesn't really improve readability, it just shifts everything to the right, resulting in shorter lines and useless whitespace at the left.
Alternatively to the header guard style you are using, you can also just put #pragma once at the start of each header file. All major compilers for all major platforms support this.
Consider delaying I/O error checking
When you are writing to the output, you check the return value of fwrite() every time. It is good practice to do error checking for all functions that might return an error. However, you will have noticed it adds a lot of lines to the code. Wouldn't it be nice if that can be avoided? It would save typing and makes the code look nicer. With most functions operating on a FILE *, the C library actually keeps track of the error state for each file. You can check whether the stream is still OK at any time by calling ferror(). The chance of an error actually occuring is low, and if it does, then delaying the error message a bit is not a problem. So in this case, you can just call fwrite() without checking the results, and just check ferror() once at the end. If ferror() returns a non-zero value, print an error and return an error.
-
2\$\begingroup\$ @Hashew One problem is the one he said: naming collisions: you can't write two functions with the same name if they aren't
static. If you do, even if they are in different files, the linker won't know what to do, because it will see two functions with the same name. \$\endgroup\$alx - recommends codidact– alx - recommends codidact2019年08月10日 17:27:52 +00:00Commented Aug 10, 2019 at 17:27 -
1\$\begingroup\$ @G.Sliepen I would advise against
static inlinein favour ofinline. The reason is basically that static inline may produce more code. Here's a link where I explained with more detail: codereview.stackexchange.com/a/223755/200418 \$\endgroup\$alx - recommends codidact– alx - recommends codidact2019年08月10日 17:37:39 +00:00Commented Aug 10, 2019 at 17:37 -
1\$\begingroup\$ About macros I would say they are great for 3 things: -I mostly use macros instead of
static constbecause you can use them to initializestaticvariables, and I always write parentheses around them just to be safe, even if I don't need them. -Function-like macros that can't be written as functions:#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])). -Generic functions:MAX(a, b);It's something very simple, and is relatively safe with a conscious user that knows it is a macro. But yes, function-like macros that can be easily replaced byinlinefunctions should always be replaced. \$\endgroup\$alx - recommends codidact– alx - recommends codidact2019年08月10日 17:55:36 +00:00Commented Aug 10, 2019 at 17:55 -
1\$\begingroup\$ @G.Sliepen Initializers: although some compilers may allow that as an extension, that is not necessarily true: stackoverflow.com/a/3025106/6872717 \$\endgroup\$alx - recommends codidact– alx - recommends codidact2019年08月11日 15:23:51 +00:00Commented Aug 11, 2019 at 15:23
-
1\$\begingroup\$ @G.Sliepen About functions replacing macros: If the function is in a header, it is in global namespace anyway, so
staticwill only produce bad consequences. If the function is in a source file, then the compiler will know when to inline, soinlinewill be useless 99.9% of the time. Basically,inlinein headers, andstaticin sources. \$\endgroup\$alx - recommends codidact– alx - recommends codidact2019年08月11日 15:27:25 +00:00Commented Aug 11, 2019 at 15:27
CHAR_BIT
Use that macro when you write 8 meaning how many bits there are in a char.
You can find it in <limits.h>
Safe (future-proof) usage of sizeof
sizeof(type)vssizeof(*foo):
foo = malloc(sizeof(*foo)); is better because if you ever change the type of foo, this call will still be valid, while if not, you would have to change every line where malloc is called with foo. If you forget any of those lines, good luck.
In your qsort calls happens the same.
Header naming collision
The header file <error.h> exists in GNU C. If you never intend to support GCC or any compiler that is compatible with GNU C (which is a lot of them), then fine. You should probably rename your header file. Using a path, such as #include "my_project/error.h" is the best solution I can think of, because you never know what weird names they might have invented as extensions to standard C; they might even invent them later than you. Yes, in "" included files, your files have precedence over the installed ones, but I wouldn't rely much on that.
From the top of my head I can't remember, but I would say that <file.h> also exists somewhere; just do the same to be safe.
Bitwise operations on signed integers
Unless you really need to do that, and you're completely sure that what you're doing is correct, don't. It's very very unsafe and unreliable. And even if you need to use Bitwise operations on signed integers, I would cast them to unsigned just to do the operation, and then cast back to the signed type to continue using it.
Also, if you can, use fixed width integers for even more safety (<stdint.h>)
Names and Order of Includes (source: Google C++ Style Guide)
Use standard order for readability and to avoid hidden dependencies: Related header, C library, C++ library, other libraries' .h, your project's .h.
All of a project's header files should be listed as descendants of the project's source directory without use of UNIX directory shortcuts . (the current directory) or .. (the parent directory). For example, google-awesome-project/src/base/logging.h should be included as:
#include "base/logging.h"
In dir/foo.cc or dir/foo_test.cc, whose main purpose is to implement or test the stuff in dir2/foo2.h, order your includes as follows:
dir2/foo2.h.
A blank line
C system files.
C++ system files.
A blank line
Other libraries' .h files.
Your project's .h files.
Note that any adjacent blank lines should be collapsed.
With the preferred ordering, if dir2/foo2.h omits any necessary includes, the build of dir/foo.cc or dir/foo_test.cc will break. Thus, this rule ensures that build breaks show up first for the people working on these files, not for innocent people in other packages.
dir/foo.cc and dir2/foo2.h are usually in the same directory (e.g. base/basictypes_test.cc and base/basictypes.h), but may sometimes be in different directories too.
Within each section the includes should be ordered alphabetically.
- In your case this would mean this order of includes:
minQueue.c:
#include "my_project/minQueue.h"
#include <stdlib.h>
#include "my_project/codes.h"
BUFSIZ
If you don't have the need of a very specific buffer size, don't write your own value, and just use this macro from <stdio.h>.
bzero is deprecated
CONFORMING TO
The
bzero()function is deprecated (marked as LEGACY in POSIX.1-2001); use memset(3) in new programs. POSIX.1-2008 removes the specification ofbzero(). Thebzero()function first appeared in 4.3BSD.The
explicit_bzero()function is a nonstandard extension that is also present on some of the BSDs. Some other implementations have a similar function, such asmemset_explicit()ormemset_s().
Use memset() instead.
strncmp
When you use fread on a buffer, you can't guarantee that it will contain a valid string, so I would use strncmp() instead of strcmp() just to be safe.
DRY
Don't repeat yourself: In every call to reportError() you repeat the file and line special identifiers. You could embed those in a macro. This is what I do:
/*
* void alx_perror(const char *restrict str);
*/
#define alx_perror(str) do \
{ \
alx_perror__(__FILE__, __LINE__, __func__, str); \
} while (0)
__attribute__((nonnull(1, 3)))
inline
void alx_perror__ (const char *restrict file, int line,
const char *restrict func, const char *restrict str);
inline
void alx_perror__ (const char *restrict file, int line,
const char *restrict func, const char *restrict str)
{
fprintf(stderr, "%s:\n", program_invocation_name);
fprintf(stderr, " %s:%i:\n", file, line);
fprintf(stderr, " %s():\n", func);
if (str)
fprintf(stderr, " %s\n", str);
fprintf(stderr, " E%i - %s\n", errno, strerror(errno));
}
ARRAY_SIZE()
When calling qsort() (actually, any function that accepts an array), in the field that states the size of the array, there are different possibilities:
- Pass the actual value
- Pass the result of
ARRAY_SIZE(arr)(defined typically as#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])))
If you can use the second one, which is when you have a global static-duration array or an array local to the function, it's the best method, because if the size of the array is changed (for example if I had int a[FOO]; and then I decide to use a different size such as int a[BAR];), I don't need to change the rest of the code. And with recent compilers, such as GCC 8, you will receive a warning if you apply that to something that is not an array, so it is safe. With old compilers, there are still tricks to make this macro safe (you can find them in StackOverflow easily).
If you can't use it, just use the value. (Never use sizeof directly, which would give the size of a pointer!).
In your call to qsort(), it would be like this:
qsort(symbols, ARRAY_SIZE(symbols), sizeof(symbols[0]), lenThenLexComp);
The decompression table construction algorithm is unusual and not an improvement over the usual algorithm. To sum up the difference,
- This algorithm: for each "decode pattern", find the code with matching prefix.
- The common algorithm: for each code, append all possible padding bit-patterns.
The common algorithm has no "search" element to it, every iteration of the inner loop fills an entry of the decoding table. Using the bit-order that you use, the entries generated from a given code inhabit a contiguous range in the decoding table, starting at the code padded on the right with zeroes and ending at the code padded on the right with ones. It does not rely on any particular order of the symbols array though.
As a bonus, the code for this is bit simpler than what you have now. For example (not tested):
for (j = 0 ; j < SYM_NUM ; j++) {
curLen = symbols[j].codeLen;
unsigned int curCode = symbols[j].codeVal;
int curSym = symbols[j].symbol;
// The code for the current symbol corresponds to all decoding
// patterns that have it as a prefix.
// Extend the code up to the length of a decoding pattern in all
// possible ways, starting from all zeroes and stopping after all ones.
int padding = MAX_CODELEN - curLen;
unsigned int start = curCode << padding;
unsigned int end = (curCode + 1) << padding;
for (k = start ; k < end ; k++) {
decompTablePtr->codeLens[k] = curLen;
decompTablePtr->symbols[k] = curSym;
}
}
huffmanNodeT. \$\endgroup\$"const.h","minQueue.h","error.h","file.h"and"codes.h"? They seem to be missing from the review. \$\endgroup\$