Pronounceable binary-to-text encoding

Question 1

This is a binary-to-text encoder that is based on Base64, but it works by converting data into an almost pronounceable string (consonant-vowel-consonant-vowel for each 3 bytes). Any improvements, suggestions, etc. are welcome.

bitspeak.hpp

#ifndef BITSPEAK_HPP
#define BITSPEAK_HPP
/*
Encoding:
 ******** ******** ********
 CCCCCCCV VVVVCCCC CCCVVVVV
-CCCCCCC ---VVVVV -CCCCCCC ---VVVVV
C: Consonant
V: Vowel
Decoding:
-******* ---***** -******* ---*****
-xxxxxxx ---xyyyy -yyyyzzz ---zzzzz
 xxxxxxxx yyyyyyyy zzzzzzzz
*/
#include <iostream>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <string>
#include <sstream>
#include <vector>
namespace bitspeak
{
static const std::string vows[32] = {"a","e","i","o","u","w","y","ae","ai","ao","au","aw","ay","ea","ee","ei","eo","eu","ey","ia","io","iu","oa","oi","ou","ow","oy","ui","uo","uy","we","wi"};
static const std::string cons[128] = {"b","c","d","f","g","h","j","k","l","m","n","p","q","r","s","t","v","x","z","bc","bl","br","bs","ch","chs","cht","ck","cl","cr","cs","ct","fl","fr","fs","ft","gl","gm","gr","gs","gz","hd","hf","hr","hs","ht","hv","jr","js","jt","kl","kr","kt","lb","lc","ld","lg","lm","ln","lp","ls","lt","lv","mb","mn","mp","mr","nd","nk","nm","np","ns","nt","nv","nz","pl","pr","ps","pt","pz","rb","rc","rch","rd","rf","rg","rk","rl","rm","rn","rp","rr","rs","rt","rv","rz","sb","sc","sch","sd","sf","sh","sl","sm","sn","sp","sq","sr","st","sth","str","sv","sz","tch","th","tl","tn","tr","ts","vl","vr","vs","vz","xc","xz","zd","zl","zp","zt"};
static const std::string vow_l = "aeiouwy";
static const std::string con_l = "bcdfghjklmnpqrstvxz";
std::vector<std::string> split(const char* str,char delim)
{
 std::stringstream ss;
 std::string str1 = str;
 std::vector<std::string> splitted;
 std::string rfs;
 while (str1[0] == delim) { str1.erase(0,1); }
 int se = str1.length();
 while (str1[se-1] == delim) { str1.erase(se-1,se); se = str1.length(); }
 for (int i=0;i<str1.length()-1;i++)
 {
 if (str1[i] == delim && str1[i+1] == delim)
 continue;
 else
 rfs += str1[i];
 }
 rfs += str1[str1.length()-1];
 for (int i=0;i<rfs.length()+1;i++)
 {
 if (rfs[i] != delim && i != rfs.length())
 ss << rfs[i];
 else
 {
 splitted.push_back(ss.str());
 ss.str(std::string());
 }
 }
 return splitted;
}
void ca3_to_ca4(uint8_t _a3[3],uint8_t _a4[4])
{
 _a4[0] = (_a3[0]&0xFE) >> 1;
 _a4[1] = ((_a3[0]&0x01) << 4) | ((_a3[1]&0xF0) >> 4);
 _a4[2] = ((_a3[1]&0x0F) << 3) | ((_a3[2]&0xE0) >> 5);
 _a4[3] = _a3[2]&0x1F;
}
void ca4_to_ca3(uint8_t _a4[4],uint8_t _a3[3])
{
 _a3[0] = ((_a4[0]&0x7F) << 1) | ((_a4[1]&0x10) >> 4);
 _a3[1] = ((_a4[1]&0x0F) << 4) | ((_a4[2]&0x78) >> 3);
 _a3[2] = ((_a4[2]&0x07) << 5) | (_a4[3]&0x1F);
}
bool is_vowel(char x) { bool o = false; int i; for (i=0;i<vow_l.size();i++) { if (x == vow_l[i]) { o = true; break; } } return o; }
bool is_consonant(char x) { bool o = false; int i; for (i=0;i<con_l.size();i++) { if (x == con_l[i]) { o = true; break; } } return o; }
uint32_t string_find(const std::string *arr,std::string x,uint32_t size)
{
 uint32_t idx = -1, i = 0;
 for (i=0;i<size;i++)
 {
 if (x == arr[i])
 {
 idx = i;
 break;
 }
 }
 return idx;
}
void encode(uint8_t *data,uint32_t length,std::string& out)
{
 uint32_t i = 0, j = 0;
 uint8_t ca3[3];
 uint8_t ca4[4];
 while (length--)
 {
 ca3[i++] = *(data++);
 if (i == 3)
 {
 ca3_to_ca4(ca3,ca4);
 // Put consonants and vowels (form is CVCV for each 3 bytes of data)
 for (i=0;i<4;i++)
 {
 if (i == 0) out += cons[ca4[0]];
 if (i == 1) out += vows[ca4[1]];
 if (i == 2) out += cons[ca4[2]];
 if (i == 3) out += vows[ca4[3]];
 }
 i = 0;
 }
 }
 if (i)
 {
 for (j=i;j<3;j++)
 ca3[j] = '0円';
 ca3_to_ca4(ca3,ca4);
 for (j=0;j<(i+1);j++)
 {
 if (j == 0) out += cons[ca4[0]];
 if (j == 1) out += vows[ca4[1]];
 if (j == 2) out += cons[ca4[2]];
 if (j == 3) out += vows[ca4[3]];
 }
 // Put padding
 while ((i++ < 3))
 out += "?";
 }
}
void decode(const char *data,uint32_t length,std::string& out)
{
 uint32_t i = 0, j = 0, si = 0;
 uint8_t ca3[3];
 uint8_t ca4[4];
 std::string sa4[4];
 uint32_t slen = length;
 std::vector<std::string> vow_v; // Vowels
 std::vector<std::string> con_v; // Consonants
 std::vector<std::string> all_v; // All data to decode
 std::string vow_tmp, con_tmp, pad_tmp; // Placeholders for consonants, vowels and padding
 uint32_t padc = 0;
 while ((i < slen))
 {
 if (is_consonant(data[i]))
 {
 con_tmp += data[i];
 vow_tmp += '*'; pad_tmp += '*';
 }
 if (is_vowel(data[i]))
 {
 vow_tmp += data[i];
 con_tmp += '*'; pad_tmp += '*';
 }
 if (data[i] == '?')
 {
 pad_tmp += data[i];
 con_tmp += '*'; vow_tmp += '*';
 padc++;
 }
 i++;
 }
 // Split
 vow_v = split(vow_tmp.c_str(),'*');
 con_v = split(con_tmp.c_str(),'*');
 // Put consonants and vowels (form is CVCV for each 3 bytes of data)
 uint32_t np = vow_v.size()+con_v.size();
 for (i=0;i<np;i++)
 {
 if (i%2 == 0) all_v.push_back(con_v[i/2]);
 if (i%2 == 1) all_v.push_back(vow_v[i/2]);
 }
 // Put padding
 for (i=0;i<padc;i++)
 all_v.push_back("?");
 uint32_t vlen = all_v.size();
 i = 0;
 while (vlen-- && (all_v[si] != "?"))
 {
 // Find which position is the one that corresponds to cons. and vow.
 if (si%2 == 0) ca4[i++] = (uint8_t)string_find(cons,con_v[si/2],128);
 if (si%2 == 1) ca4[i++] = (uint8_t)string_find(vows,vow_v[si/2], 32);
 si++;
 if (i == 4)
 {
 ca4_to_ca3(ca4,ca3);
 for (i=0;i<3;i++)
 {
 if (i == 0) out += ca3[0];
 if (i == 1) out += ca3[1];
 if (i == 2) out += ca3[2];
 }
 i = 0;
 }
 }
 if (i)
 {
 for (j=i;j<4;j++)
 ca4[j] = 0;
 ca4_to_ca3(ca4,ca3);
 for (j=0;j<(i-1);j++)
 {
 if (j == 0) out += ca3[0];
 if (j == 1) out += ca3[1];
 if (j == 2) out += ca3[2];
 }
 }
}
} // end of "bitspeak" namespace
#endif

bitspeak_test.cpp

#include <iostream>
#include <cstdlib>
#include <cstring>
#include <string>
#include "bitspeak.hpp"
using namespace std;
int main(int argc,char **argv)
{
 string tests[4] = {
 "Technological progress is like an axe in the hands of a pathological criminal.",
 "Bitspeak test",
 "123456789abcdef",
 " ((( d(O_o)b ))) "
 };
 cout << ".....--------============= Bitspeak test =============--------.....\n";
 for (int i=0;i<4;i++)
 {
 string encoded, decoded;
 cout << "Test string " << (i+1) << ": " << tests[i] << "\n";
 bitspeak::encode(
 (uint8_t*)tests[i].c_str(),
 tests[i].length(),
 encoded
 );
 cout << "Encoded: " << encoded << "\n";
 bitspeak::decode(
 encoded.c_str(),
 encoded.length(),
 decoded
 );
 cout << "Decoded: " << decoded << "\n\n";
 }
}

Here is the output of the test program:

.....--------============= Bitspeak test =============--------.....
Test string 1: Technological progress is like an axe in the hands of a pathological criminal.
Encoded: hryhsolbytneildyxzaelboacleldifeolnyxzaelnyhsialneyfaolneyfaylboarswvypeevypoukreyfaolgifiolbyhfalbypeekraechtalgoaklajteyfeojtoiglailgoasfeiktoaprojtoaschakloibcaoldoapreejtoaschee
Decoded: Technological progress is like an axe in the hands of a pathological criminal.
Test string 2: Bitspeak test
Encoded: fsypriolnoifwjtoarpalpyhsialpa??
Decoded: Bitspeak test
Test string 3: 123456789abcdef
Encoded: chsiaxiackohfoacliamrowjtoabcokryhsy
Decoded: 123456789abcdef
Test string 4: ((( d(O_o)b ))) 
Encoded: vimraiblifubluxcwilgeyprivinzaobleyb?
Decoded: ((( d(O_o)b )))

Question 2

Can you add expected outcome of your test strings encoding?

Question 3

@vnp Yes, now I added it.

Question 4

your sample outputs dont follow the cvcv pattern you give in the description

Question 5

@pm100 I was referring to the sounds used, that are combinations of each kind of letter (consonants and vowels).

Question 6

Your test program should not just print the values for manual inspection, it should check that the conversion's results are the expected ones, and otherwise fail. For more information on this subject, search for unit tests.

Do not use identifiers starting with an underscore (e.g. _a3). Such identifiers are reserved for the system. Whenever you use them, your code looks like it were poking around in the innards of the system.

Don't use using namespace std (just search for this on Stack Overflow).

Bug: when a string consists of only delimiters, your code accesses str[-1], which leads to undefined behavior.

Question 7

Now I fixed the bug in the split function: I put the for loops into an if (str1.length() > 0) { ... } conditional (if the input is a string of only delimiters, str1 will be empty and those loops will do nothing), so it will return an empty vector instead of fail. Also, I did the checking of converted/expected results, on the test program. Thank you!

Question 8

File organization

You should have three files:

bitspeak.hpp, the header containing only the minimally necessary definitions
bitspeak.cpp, the implementation of the functions defined in the header
bitspeak_test.cpp, the tests for the implementation

bitspeak.hpp

#ifndef BITSPEAK_HPP
#define BITSPEAK_HPP
#include <string>
namespace bitspeak
{
 std::string encode(const std::string &data);
 std::string decode(const std::string &text);
}
#endif

This is the simplest definition of the interface that a caller needs. The std::string class can handle all bytes, therefore it is suitable both for strings of char, as well as for byte arrays.

bitspeak.cpp

Instead of writing the vows and cons in a single line, I rather like them to look like this:

static const std::string vows[32] = {
 "a", "e", "i", "o", "u", "w", "y", "ae",
 "ai", "ao", "au", "aw", "ay", "ea", "ee", "ei",
 "eo", "eu", "ey", "ia", "io", "iu", "oa", "oi",
 "ou", "ow", "oy", "ui", "uo", "uy", "we", "wi"
};
static const std::string cons[128] = {
 "b", "c", "d", "f", "g", "h", "j", "k",
 "l", "m", "n", "p", "q", "r", "s", "t",
 "v", "x", "z", "bc", "bl", "br", "bs", "ch",
 "chs", "cht", "ck", "cl", "cr", "cs", "ct", "fl",
 "fr", "fs", "ft", "gl", "gm", "gr", "gs", "gz",
 "hd", "hf", "hr", "hs", "ht", "hv", "jr", "js",
 "jt", "kl", "kr", "kt", "lb", "lc", "ld", "lg",
 "lm", "ln", "lp", "ls", "lt", "lv", "mb", "mn",
 "mp", "mr", "nd", "nk", "nm", "np", "ns", "nt",
 "nv", "nz", "pl", "pr", "ps", "pt", "pz", "rb",
 "rc", "rch", "rd", "rf", "rg", "rk", "rl", "rm",
 "rn", "rp", "rr", "rs", "rt", "rv", "rz", "sb",
 "sc", "sch", "sd", "sf", "sh", "sl", "sm", "sn",
 "sp", "sq", "sr", "st", "sth", "str", "sv", "sz",
 "tch", "th", "tl", "tn", "tr", "ts", "vl", "vr",
 "vs", "vz", "xc", "xz", "zd", "zl", "zp", "zt"
};

Grouping them in 8 per line lets the reader quickly check that these are indeed 32 respectively 128 strings.

The implementation of bitspeak::encode can be done in a single pass without unnecessary copying:

namespace bitspeak {
std::string encode(const std::string &data)
{
 std::ostringstream out;
 std::size_t len = data.length();
 for (std::size_t i = 0; i < len; i += 3) {
 uint8_t d0 = data[i + 0];
 uint8_t d1 = i + 1 < len ? data[i + 1] : 0;
 uint8_t d2 = i + 2 < len ? data[i + 2] : 0;
 uint8_t cc0 = (d0 & 0xFE) >> 1;
 uint8_t vv0 = ((d0 & 0x01) << 4) | ((d1 & 0xF0) >> 4);
 uint8_t cc1 = ((d1 & 0x0F) << 3) | ((d2 & 0xE0) >> 5);
 uint8_t vv1 = d2 & 0x1F;
 out << cons[cc0];
 out << vows[vv0];
 out << (i + 1 < len ? cons[cc1] : "?");
 out << (i + 2 < len ? vows[vv1] : "?");
 }
 return out.str();
}
} // end of namespace

I have changed the following things:

There is a single loop, which processes 3 bytes at a time (note that i += 3 is used here instead of the usual i++).
Instead of using uint32_t, the code uses std::size_t, which is exactly the type returned by std::string.length().
The first block that defines d0, d1 and d2 makes sure that the data bytes are interpreted as unsigned.
They are then converted in the same way that you used in the ca3_to_ca4 function.
The last block appends the consonants and vowels. Even in the last group, the first pair of consonants and vowels is always relevant, since the bits from d0 influence cc0 and vv0. The remaining characters may need to be replaced with the padding character.
The class std::ostringstream is suitable for efficient appending to a string.

For decoding a string of consonants and vowels into the original data, idiomatic C++ code does not copy much data around. Instead, it uses indexes or iterators into the strings, which are only pointers that can be efficiently passed around.

The basic idea is this:

find a sequence of consonants
find a sequence of vowels
find a sequence of consonants
find a sequence of vowels
map the above sequences to indices
bitshift the indices around to produce the original data
repeat until the end of the encoded string

The padding needs to be handled specially since it is only allowed at the very end. And this is the code I came up with:

std::string decode(const std::string &text)
{
 std::size_t i = 0;
 std::size_t len = text.length();
 std::ostringstream out;
 while (i < len) {
 auto cc0start = i;
 while (i < len && con_l.find(text[i]) != std::string::npos)
 i++;
 auto vv0start = i;
 while (i < len && vow_l.find(text[i]) != std::string::npos)
 i++;
 auto cc1start = i;
 while (i < len && con_l.find(text[i]) != std::string::npos)
 i++;
 auto vv1start = i;
 while (i < len && vow_l.find(text[i]) != std::string::npos)
 i++;
 if (i == cc0start)
 throw "invalid encoded data (neither consonant nor vowel)";
 auto cc0 = text.substr(cc0start, vv0start - cc0start);
 auto vv0 = text.substr(vv0start, cc1start - vv0start);
 auto cc1 = text.substr(cc1start, vv1start - cc1start);
 auto vv1 = text.substr(vv1start, i - vv1start);
 auto cons_end = cons + 128;
 auto vows_end = vows + 32;
 auto cc0bits = std::find(cons, cons_end, cc0) - cons;
 auto vv0bits = std::find(vows, vows_end, vv0) - vows;
 auto cc1bits = std::find(cons, cons_end, cc1) - cons;
 auto vv1bits = std::find(vows, vows_end, vv1) - vows;
 if (cc0bits == 128 || vv0bits == 32)
 throw "invalid encoded data (unknown letter combination)";
 unsigned char d0 = (cc0bits << 1) | ((vv0bits & 0x10) >> 4);
 out << d0;
 if (cc1bits == 128) {
 if (text.substr(i) != "??")
 throw "invalid encoded data (padding)";
 break;
 }
 unsigned char d1 = ((vv0bits & 0x0F) << 4) | ((cc1bits & 0x78) >> 3);
 out << d1;
 if (vv1bits == 32) {
 if (text.substr(i) != "?")
 throw "invalid encoded data (padding)";
 break;
 }
 unsigned char d2 = ((cc1bits & 0x07) << 5) | (vv1bits & 0x1F);
 out << d2;
 }
 return out.str();
}

I know that the above code violates some best practice rules:

it contains duplicate code in the while loops. I did this since extracting the loops to a separate function would add even more code.
it contains hard-coded numbers, which some readers may consider magic numbers. But since they are powers of 2 and mentioned in the specification, they are not magic, they are only numbers, like the other numbers in the bit-shifting code.
these numbers are even repeated in several places. I did not make them constants since they will not change anyway. And even if they should do, each 128 in the code has exactly the same meaning, so they can all be changed doing a simple text replacement.

Your is_vowel function can be shortened a bit.

bool is_vowel(char x) { bool o = false; std::size_t i; for (i = 0; i < vow_l.size(); i++) { if (x == vow_l[i]) { o = true; break; } } return o; }
bool is_vowel(char x) { std::size_t i; for (i = 0; i < vow_l.size(); i++) { if (x == vow_l[i]) { return true; } } return false; }
bool is_vowel(char x) { for (std::size_t i = 0; i < vow_l.size(); i++) { if (x == vow_l[i]) { return true; } } return false; }
bool is_vowel(char x) { return vow_l.find(x) != std::string::npos; }

Instead of using break and a return value, it is often simpler to just return value, e.g. in string_find.

Roland Illig Roland Illig 21.8k2 gold badges36 silver badges83 bronze badges · Accepted Answer · 2016-10-24 06:47:35Z

Your test program should not just print the values for manual inspection, it should check that the conversion's results are the expected ones, and otherwise fail. For more information on this subject, search for unit tests.

Do not use identifiers starting with an underscore (e.g. _a3). Such identifiers are reserved for the system. Whenever you use them, your code looks like it were poking around in the innards of the system.

Don't use using namespace std (just search for this on Stack Overflow).

Bug: when a string consists of only delimiters, your code accesses str[-1], which leads to undefined behavior.

Now I fixed the bug in the split function: I put the for loops into an if (str1.length() > 0) { ... } conditional (if the input is a string of only delimiters, str1 will be empty and those loops will do nothing), so it will return an empty vector instead of fail. Also, I did the checking of converted/expected results, on the test program. Thank you!

Stack Exchange Network

Pronounceable binary-to-text encoding

2 Answers 2

File organization

bitspeak.hpp

bitspeak.cpp

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Pronounceable binary-to-text encoding

2 Answers 2

File organization

bitspeak.hpp

bitspeak.cpp

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions