This is a binary-to-text encoder that is based on Base64, but it works by converting data into an almost pronounceable string (consonant-vowel-consonant-vowel for each 3 bytes). Any improvements, suggestions, etc. are welcome.
bitspeak.hpp
#ifndef BITSPEAK_HPP
#define BITSPEAK_HPP
/*
Encoding:
******** ******** ********
CCCCCCCV VVVVCCCC CCCVVVVV
-CCCCCCC ---VVVVV -CCCCCCC ---VVVVV
C: Consonant
V: Vowel
Decoding:
-******* ---***** -******* ---*****
-xxxxxxx ---xyyyy -yyyyzzz ---zzzzz
xxxxxxxx yyyyyyyy zzzzzzzz
*/
#include <iostream>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <string>
#include <sstream>
#include <vector>
namespace bitspeak
{
static const std::string vows[32] = {"a","e","i","o","u","w","y","ae","ai","ao","au","aw","ay","ea","ee","ei","eo","eu","ey","ia","io","iu","oa","oi","ou","ow","oy","ui","uo","uy","we","wi"};
static const std::string cons[128] = {"b","c","d","f","g","h","j","k","l","m","n","p","q","r","s","t","v","x","z","bc","bl","br","bs","ch","chs","cht","ck","cl","cr","cs","ct","fl","fr","fs","ft","gl","gm","gr","gs","gz","hd","hf","hr","hs","ht","hv","jr","js","jt","kl","kr","kt","lb","lc","ld","lg","lm","ln","lp","ls","lt","lv","mb","mn","mp","mr","nd","nk","nm","np","ns","nt","nv","nz","pl","pr","ps","pt","pz","rb","rc","rch","rd","rf","rg","rk","rl","rm","rn","rp","rr","rs","rt","rv","rz","sb","sc","sch","sd","sf","sh","sl","sm","sn","sp","sq","sr","st","sth","str","sv","sz","tch","th","tl","tn","tr","ts","vl","vr","vs","vz","xc","xz","zd","zl","zp","zt"};
static const std::string vow_l = "aeiouwy";
static const std::string con_l = "bcdfghjklmnpqrstvxz";
std::vector<std::string> split(const char* str,char delim)
{
std::stringstream ss;
std::string str1 = str;
std::vector<std::string> splitted;
std::string rfs;
while (str1[0] == delim) { str1.erase(0,1); }
int se = str1.length();
while (str1[se-1] == delim) { str1.erase(se-1,se); se = str1.length(); }
for (int i=0;i<str1.length()-1;i++)
{
if (str1[i] == delim && str1[i+1] == delim)
continue;
else
rfs += str1[i];
}
rfs += str1[str1.length()-1];
for (int i=0;i<rfs.length()+1;i++)
{
if (rfs[i] != delim && i != rfs.length())
ss << rfs[i];
else
{
splitted.push_back(ss.str());
ss.str(std::string());
}
}
return splitted;
}
void ca3_to_ca4(uint8_t _a3[3],uint8_t _a4[4])
{
_a4[0] = (_a3[0]&0xFE) >> 1;
_a4[1] = ((_a3[0]&0x01) << 4) | ((_a3[1]&0xF0) >> 4);
_a4[2] = ((_a3[1]&0x0F) << 3) | ((_a3[2]&0xE0) >> 5);
_a4[3] = _a3[2]&0x1F;
}
void ca4_to_ca3(uint8_t _a4[4],uint8_t _a3[3])
{
_a3[0] = ((_a4[0]&0x7F) << 1) | ((_a4[1]&0x10) >> 4);
_a3[1] = ((_a4[1]&0x0F) << 4) | ((_a4[2]&0x78) >> 3);
_a3[2] = ((_a4[2]&0x07) << 5) | (_a4[3]&0x1F);
}
bool is_vowel(char x) { bool o = false; int i; for (i=0;i<vow_l.size();i++) { if (x == vow_l[i]) { o = true; break; } } return o; }
bool is_consonant(char x) { bool o = false; int i; for (i=0;i<con_l.size();i++) { if (x == con_l[i]) { o = true; break; } } return o; }
uint32_t string_find(const std::string *arr,std::string x,uint32_t size)
{
uint32_t idx = -1, i = 0;
for (i=0;i<size;i++)
{
if (x == arr[i])
{
idx = i;
break;
}
}
return idx;
}
void encode(uint8_t *data,uint32_t length,std::string& out)
{
uint32_t i = 0, j = 0;
uint8_t ca3[3];
uint8_t ca4[4];
while (length--)
{
ca3[i++] = *(data++);
if (i == 3)
{
ca3_to_ca4(ca3,ca4);
// Put consonants and vowels (form is CVCV for each 3 bytes of data)
for (i=0;i<4;i++)
{
if (i == 0) out += cons[ca4[0]];
if (i == 1) out += vows[ca4[1]];
if (i == 2) out += cons[ca4[2]];
if (i == 3) out += vows[ca4[3]];
}
i = 0;
}
}
if (i)
{
for (j=i;j<3;j++)
ca3[j] = '0円';
ca3_to_ca4(ca3,ca4);
for (j=0;j<(i+1);j++)
{
if (j == 0) out += cons[ca4[0]];
if (j == 1) out += vows[ca4[1]];
if (j == 2) out += cons[ca4[2]];
if (j == 3) out += vows[ca4[3]];
}
// Put padding
while ((i++ < 3))
out += "?";
}
}
void decode(const char *data,uint32_t length,std::string& out)
{
uint32_t i = 0, j = 0, si = 0;
uint8_t ca3[3];
uint8_t ca4[4];
std::string sa4[4];
uint32_t slen = length;
std::vector<std::string> vow_v; // Vowels
std::vector<std::string> con_v; // Consonants
std::vector<std::string> all_v; // All data to decode
std::string vow_tmp, con_tmp, pad_tmp; // Placeholders for consonants, vowels and padding
uint32_t padc = 0;
while ((i < slen))
{
if (is_consonant(data[i]))
{
con_tmp += data[i];
vow_tmp += '*'; pad_tmp += '*';
}
if (is_vowel(data[i]))
{
vow_tmp += data[i];
con_tmp += '*'; pad_tmp += '*';
}
if (data[i] == '?')
{
pad_tmp += data[i];
con_tmp += '*'; vow_tmp += '*';
padc++;
}
i++;
}
// Split
vow_v = split(vow_tmp.c_str(),'*');
con_v = split(con_tmp.c_str(),'*');
// Put consonants and vowels (form is CVCV for each 3 bytes of data)
uint32_t np = vow_v.size()+con_v.size();
for (i=0;i<np;i++)
{
if (i%2 == 0) all_v.push_back(con_v[i/2]);
if (i%2 == 1) all_v.push_back(vow_v[i/2]);
}
// Put padding
for (i=0;i<padc;i++)
all_v.push_back("?");
uint32_t vlen = all_v.size();
i = 0;
while (vlen-- && (all_v[si] != "?"))
{
// Find which position is the one that corresponds to cons. and vow.
if (si%2 == 0) ca4[i++] = (uint8_t)string_find(cons,con_v[si/2],128);
if (si%2 == 1) ca4[i++] = (uint8_t)string_find(vows,vow_v[si/2], 32);
si++;
if (i == 4)
{
ca4_to_ca3(ca4,ca3);
for (i=0;i<3;i++)
{
if (i == 0) out += ca3[0];
if (i == 1) out += ca3[1];
if (i == 2) out += ca3[2];
}
i = 0;
}
}
if (i)
{
for (j=i;j<4;j++)
ca4[j] = 0;
ca4_to_ca3(ca4,ca3);
for (j=0;j<(i-1);j++)
{
if (j == 0) out += ca3[0];
if (j == 1) out += ca3[1];
if (j == 2) out += ca3[2];
}
}
}
} // end of "bitspeak" namespace
#endif
bitspeak_test.cpp
#include <iostream>
#include <cstdlib>
#include <cstring>
#include <string>
#include "bitspeak.hpp"
using namespace std;
int main(int argc,char **argv)
{
string tests[4] = {
"Technological progress is like an axe in the hands of a pathological criminal.",
"Bitspeak test",
"123456789abcdef",
" ((( d(O_o)b ))) "
};
cout << ".....--------============= Bitspeak test =============--------.....\n";
for (int i=0;i<4;i++)
{
string encoded, decoded;
cout << "Test string " << (i+1) << ": " << tests[i] << "\n";
bitspeak::encode(
(uint8_t*)tests[i].c_str(),
tests[i].length(),
encoded
);
cout << "Encoded: " << encoded << "\n";
bitspeak::decode(
encoded.c_str(),
encoded.length(),
decoded
);
cout << "Decoded: " << decoded << "\n\n";
}
}
Here is the output of the test program:
.....--------============= Bitspeak test =============--------..... Test string 1: Technological progress is like an axe in the hands of a pathological criminal. Encoded: hryhsolbytneildyxzaelboacleldifeolnyxzaelnyhsialneyfaolneyfaylboarswvypeevypoukreyfaolgifiolbyhfalbypeekraechtalgoaklajteyfeojtoiglailgoasfeiktoaprojtoaschakloibcaoldoapreejtoaschee Decoded: Technological progress is like an axe in the hands of a pathological criminal. Test string 2: Bitspeak test Encoded: fsypriolnoifwjtoarpalpyhsialpa?? Decoded: Bitspeak test Test string 3: 123456789abcdef Encoded: chsiaxiackohfoacliamrowjtoabcokryhsy Decoded: 123456789abcdef Test string 4: ((( d(O_o)b ))) Encoded: vimraiblifubluxcwilgeyprivinzaobleyb? Decoded: ((( d(O_o)b )))
-
\$\begingroup\$ Can you add expected outcome of your test strings encoding? \$\endgroup\$vnp– vnp2016年10月24日 06:42:19 +00:00Commented Oct 24, 2016 at 6:42
-
\$\begingroup\$ @vnp Yes, now I added it. \$\endgroup\$sator.arepo.tenet.opera.rotas– sator.arepo.tenet.opera.rotas2016年10月24日 18:35:25 +00:00Commented Oct 24, 2016 at 18:35
-
\$\begingroup\$ your sample outputs dont follow the cvcv pattern you give in the description \$\endgroup\$pm100– pm1002016年10月24日 19:04:37 +00:00Commented Oct 24, 2016 at 19:04
-
\$\begingroup\$ @pm100 I was referring to the sounds used, that are combinations of each kind of letter (consonants and vowels). \$\endgroup\$sator.arepo.tenet.opera.rotas– sator.arepo.tenet.opera.rotas2016年10月25日 01:48:49 +00:00Commented Oct 25, 2016 at 1:48
2 Answers 2
Your test program should not just print the values for manual inspection, it should check that the conversion's results are the expected ones, and otherwise fail. For more information on this subject, search for unit tests.
Do not use identifiers starting with an underscore (e.g. _a3
). Such identifiers are reserved for the system. Whenever you use them, your code looks like it were poking around in the innards of the system.
Don't use using namespace std
(just search for this on Stack Overflow).
Bug: when a string consists of only delimiters, your code accesses str[-1]
, which leads to undefined behavior.
-
\$\begingroup\$ Now I fixed the bug in the
split
function: I put thefor
loops into anif (str1.length() > 0) { ... }
conditional (if the input is a string of only delimiters,str1
will be empty and those loops will do nothing), so it will return an empty vector instead of fail. Also, I did the checking of converted/expected results, on the test program. Thank you! \$\endgroup\$sator.arepo.tenet.opera.rotas– sator.arepo.tenet.opera.rotas2016年10月26日 05:06:55 +00:00Commented Oct 26, 2016 at 5:06
File organization
You should have three files:
bitspeak.hpp
, the header containing only the minimally necessary definitionsbitspeak.cpp
, the implementation of the functions defined in the headerbitspeak_test.cpp
, the tests for the implementation
bitspeak.hpp
#ifndef BITSPEAK_HPP
#define BITSPEAK_HPP
#include <string>
namespace bitspeak
{
std::string encode(const std::string &data);
std::string decode(const std::string &text);
}
#endif
This is the simplest definition of the interface that a caller needs. The std::string
class can handle all bytes, therefore it is suitable both for strings of char
, as well as for byte arrays.
bitspeak.cpp
Instead of writing the vows
and cons
in a single line, I rather like them to look like this:
static const std::string vows[32] = {
"a", "e", "i", "o", "u", "w", "y", "ae",
"ai", "ao", "au", "aw", "ay", "ea", "ee", "ei",
"eo", "eu", "ey", "ia", "io", "iu", "oa", "oi",
"ou", "ow", "oy", "ui", "uo", "uy", "we", "wi"
};
static const std::string cons[128] = {
"b", "c", "d", "f", "g", "h", "j", "k",
"l", "m", "n", "p", "q", "r", "s", "t",
"v", "x", "z", "bc", "bl", "br", "bs", "ch",
"chs", "cht", "ck", "cl", "cr", "cs", "ct", "fl",
"fr", "fs", "ft", "gl", "gm", "gr", "gs", "gz",
"hd", "hf", "hr", "hs", "ht", "hv", "jr", "js",
"jt", "kl", "kr", "kt", "lb", "lc", "ld", "lg",
"lm", "ln", "lp", "ls", "lt", "lv", "mb", "mn",
"mp", "mr", "nd", "nk", "nm", "np", "ns", "nt",
"nv", "nz", "pl", "pr", "ps", "pt", "pz", "rb",
"rc", "rch", "rd", "rf", "rg", "rk", "rl", "rm",
"rn", "rp", "rr", "rs", "rt", "rv", "rz", "sb",
"sc", "sch", "sd", "sf", "sh", "sl", "sm", "sn",
"sp", "sq", "sr", "st", "sth", "str", "sv", "sz",
"tch", "th", "tl", "tn", "tr", "ts", "vl", "vr",
"vs", "vz", "xc", "xz", "zd", "zl", "zp", "zt"
};
Grouping them in 8 per line lets the reader quickly check that these are indeed 32 respectively 128 strings.
The implementation of bitspeak::encode
can be done in a single pass without unnecessary copying:
namespace bitspeak {
std::string encode(const std::string &data)
{
std::ostringstream out;
std::size_t len = data.length();
for (std::size_t i = 0; i < len; i += 3) {
uint8_t d0 = data[i + 0];
uint8_t d1 = i + 1 < len ? data[i + 1] : 0;
uint8_t d2 = i + 2 < len ? data[i + 2] : 0;
uint8_t cc0 = (d0 & 0xFE) >> 1;
uint8_t vv0 = ((d0 & 0x01) << 4) | ((d1 & 0xF0) >> 4);
uint8_t cc1 = ((d1 & 0x0F) << 3) | ((d2 & 0xE0) >> 5);
uint8_t vv1 = d2 & 0x1F;
out << cons[cc0];
out << vows[vv0];
out << (i + 1 < len ? cons[cc1] : "?");
out << (i + 2 < len ? vows[vv1] : "?");
}
return out.str();
}
} // end of namespace
I have changed the following things:
- There is a single loop, which processes 3 bytes at a time (note that
i += 3
is used here instead of the usuali++
). - Instead of using
uint32_t
, the code usesstd::size_t
, which is exactly the type returned bystd::string.length()
. - The first block that defines
d0
,d1
andd2
makes sure that the data bytes are interpreted as unsigned. - They are then converted in the same way that you used in the
ca3_to_ca4
function. - The last block appends the consonants and vowels. Even in the last group, the first pair of consonants and vowels is always relevant, since the bits from
d0
influencecc0
andvv0
. The remaining characters may need to be replaced with the padding character. - The class
std::ostringstream
is suitable for efficient appending to a string.
For decoding a string of consonants and vowels into the original data, idiomatic C++ code does not copy much data around. Instead, it uses indexes or iterators into the strings, which are only pointers that can be efficiently passed around.
The basic idea is this:
- find a sequence of consonants
- find a sequence of vowels
- find a sequence of consonants
- find a sequence of vowels
- map the above sequences to indices
- bitshift the indices around to produce the original data
- repeat until the end of the encoded string
The padding needs to be handled specially since it is only allowed at the very end. And this is the code I came up with:
std::string decode(const std::string &text)
{
std::size_t i = 0;
std::size_t len = text.length();
std::ostringstream out;
while (i < len) {
auto cc0start = i;
while (i < len && con_l.find(text[i]) != std::string::npos)
i++;
auto vv0start = i;
while (i < len && vow_l.find(text[i]) != std::string::npos)
i++;
auto cc1start = i;
while (i < len && con_l.find(text[i]) != std::string::npos)
i++;
auto vv1start = i;
while (i < len && vow_l.find(text[i]) != std::string::npos)
i++;
if (i == cc0start)
throw "invalid encoded data (neither consonant nor vowel)";
auto cc0 = text.substr(cc0start, vv0start - cc0start);
auto vv0 = text.substr(vv0start, cc1start - vv0start);
auto cc1 = text.substr(cc1start, vv1start - cc1start);
auto vv1 = text.substr(vv1start, i - vv1start);
auto cons_end = cons + 128;
auto vows_end = vows + 32;
auto cc0bits = std::find(cons, cons_end, cc0) - cons;
auto vv0bits = std::find(vows, vows_end, vv0) - vows;
auto cc1bits = std::find(cons, cons_end, cc1) - cons;
auto vv1bits = std::find(vows, vows_end, vv1) - vows;
if (cc0bits == 128 || vv0bits == 32)
throw "invalid encoded data (unknown letter combination)";
unsigned char d0 = (cc0bits << 1) | ((vv0bits & 0x10) >> 4);
out << d0;
if (cc1bits == 128) {
if (text.substr(i) != "??")
throw "invalid encoded data (padding)";
break;
}
unsigned char d1 = ((vv0bits & 0x0F) << 4) | ((cc1bits & 0x78) >> 3);
out << d1;
if (vv1bits == 32) {
if (text.substr(i) != "?")
throw "invalid encoded data (padding)";
break;
}
unsigned char d2 = ((cc1bits & 0x07) << 5) | (vv1bits & 0x1F);
out << d2;
}
return out.str();
}
I know that the above code violates some best practice rules:
- it contains duplicate code in the
while
loops. I did this since extracting the loops to a separate function would add even more code. - it contains hard-coded numbers, which some readers may consider magic numbers. But since they are powers of 2 and mentioned in the specification, they are not magic, they are only numbers, like the other numbers in the bit-shifting code.
- these numbers are even repeated in several places. I did not make them constants since they will not change anyway. And even if they should do, each
128
in the code has exactly the same meaning, so they can all be changed doing a simple text replacement.
Your is_vowel
function can be shortened a bit.
bool is_vowel(char x) { bool o = false; std::size_t i; for (i = 0; i < vow_l.size(); i++) { if (x == vow_l[i]) { o = true; break; } } return o; }
bool is_vowel(char x) { std::size_t i; for (i = 0; i < vow_l.size(); i++) { if (x == vow_l[i]) { return true; } } return false; }
bool is_vowel(char x) { for (std::size_t i = 0; i < vow_l.size(); i++) { if (x == vow_l[i]) { return true; } } return false; }
bool is_vowel(char x) { return vow_l.find(x) != std::string::npos; }
Instead of using break
and a return value, it is often simpler to just return value
, e.g. in string_find
.