#!/usr/local/bin/node // Dictionary-compress an input text file. Shrinks the KJV bible by // about 61%, using a pure-printable-ASCII encoding, but takes about // 60 seconds to do it. Also, it may be vulnerable to input files // containing "magic words" like `__proto__`. // `compress` gets a 63% reduction on the same input file, but uses // all 256 byte values, rather than the 96 used by this program. // We're actually only getting a little over 5 bits per byte. So // without that handicap, we’d beat `compress`! In under 50 lines of // code. But only on files like that: large natural-language text // files. // I haven’t written the decompressor, so it’s possible that this // compression isn’t really lossless — that it actually corrupts data // irrecoverably rather than compressing it. // On smaller files and files that aren’t really text, it will tend to // do a lot worse, often even producing an output file that’s larger // than the input. If the dictionary itself were compressed using // some other compression algorithm, that problem would probably be // less common. // Oh, also, it reads the whole file into memory, and then uses // several times that amount of memory. It could be written to use // purely sequential access, but that will be really slow. var sys = require('sys') , fs = require('fs') , data = fs.readFileSync(process.ARGV[2], 'UTF-8') , words = data.split(/(\w+)/) , freq = {} , uniqWords = [] , wordIndexes = {} ; writeOut(words.length); writeOut("\n"); for (var ii = 0; ii < words.length; ii++) { var word = words[ii]; if (!(word in freq)) { uniqWords.push(word); freq[word] = 0; } freq[word]++; } if (' ' in freq) freq[' '] = 2; writeOut(uniqWords.length); writeOut("\n"); // Assign most frequent words to lowest indices to give them // one-letter codes. uniqWords.sort(function(a, b) { return freq[b] - freq[a]; }); for (var ii = 0; ii < uniqWords.length; ii++) { var word = uniqWords[ii]; writeOut(escape(word) + "\n"); wordIndexes[word] = ii; } for (var ii = 0; ii < words.length; ii++) { var word = words[ii]; // The decompressor can infer a space between two otherwise // unseparated \w+ words. if (word === ' ' && ii !== 0 && ii !== words.length - 1) continue; writeOut(encodeNumber(wordIndexes[word])); } writeOut("\n"); // The encoding scheme here is that we use all of the printable ASCII // characters, of which there are 95, plus newline, for a total of 96. // Half the characters we use (the second half) are termination // markers, so we are encoding in base 48. In essence, we allocate // one bit per digit as a termination marker. function encodeNumber(number, nonLastDigit) { var base = 48 , remainder = number % base , quotient = (number - remainder) / base , leadingDigits = quotient ? encodeNumber(quotient, true) : '' , digitIndex = nonLastDigit ? remainder : remainder + base , digit = digitIndex ? String.fromCharCode(31+digitIndex) : '\n'; ; return leadingDigits + digit; } function writeOut(data) { fs.writeSync(1, data); }