#include <gtest/gtest.h>
#include <algorithm>
#include <string>
// Test helper: applies the per-character function "tr" to every
// character of "s" and returns the transformed copy.
auto tr_string(auto const& tr, std::wstring s)
{
    std::transform(s.begin(), s.end(), s.begin(), tr);
    return s;
}
// "from" and "to" of different lengths must be rejected with an exception.
TEST(tr, bad_args)
{
    auto const* const five_chars = L"abcde";
    auto const* const four_chars = L"bcde";
    EXPECT_THROW(make_substitutor(five_chars, four_chars), std::invalid_argument);
}
// Rotating substitution over a..e: each listed character maps to the
// next one (and 'e' wraps to 'a'); unlisted characters pass through.
TEST(tr, abcdenoop)
{
    auto const tr = make_substitutor(L"abcde", L"bcdea");
    EXPECT_EQ(tr_string(tr, L""), L"");
    // The 'e' of "hello" is in the "from" set, so it is translated to 'a'.
    EXPECT_EQ(tr_string(tr, L"fedcbahello"), L"faedcbhallo");
}
// Plain ASCII substitution: e->i, h->p, l->z, o->a.
TEST(tr, english)
{
    auto const to_pizza = make_substitutor(L"ehlo", L"ipza");
    std::wstring const from_empty = tr_string(to_pizza, L"");
    std::wstring const from_hello = tr_string(to_pizza, L"hello");
    EXPECT_EQ(from_empty, L"");
    EXPECT_EQ(from_hello, L"pizza");
}
// Non-ASCII substitution: lower-case Greek letters (including final
// sigma) are mapped to their upper-case forms.
TEST(tr, greek)
{
    auto const* const lower = L"αβγδεζηθικλμνξοπρσςτυφχψω";
    auto const* const upper = L"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ";
    auto const to_upper = make_substitutor(lower, upper);
    std::wstring const shouted = tr_string(to_upper, L"Γεια σας");
    EXPECT_EQ(shouted, L"ΓΕΙΑ ΣΑΣ");
}
Quick demo
$ ./wchar-tr αβγδεζηθικλμνξοπρσςτυφχψω ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ <<<'Γεια σας'
ΓΕΙΑ ΣΑΣ
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
// Test helper: applies the per-character function "tr" to every
// character of "s" and returns the transformed copy.
auto tr_string(auto const& tr, std::wstring s)
{
    std::transform(s.begin(), s.end(), s.begin(), tr);
    return s;
}
// "from" and "to" of different lengths must be rejected with an exception.
TEST(tr, bad_args)
{
    auto const* const five_chars = L"abcde";
    auto const* const four_chars = L"bcde";
    EXPECT_THROW(make_substitutor(five_chars, four_chars), std::invalid_argument);
}
// Rotating substitution over a..e: each listed character maps to the
// next one (and 'e' wraps to 'a'); 'f' is unlisted and passes through.
TEST(tr, abcde)
{
    auto const rotate = make_substitutor(L"abcde", L"bcdea");
    std::wstring const from_empty = tr_string(rotate, L"");
    std::wstring const from_text = tr_string(rotate, L"fedcba");
    EXPECT_EQ(from_empty, L"");
    EXPECT_EQ(from_text, L"faedcb");
}
// Non-ASCII substitution: lower-case Greek letters (including final
// sigma) are mapped to their upper-case forms.
TEST(tr, greek)
{
    auto const* const lower = L"αβγδεζηθικλμνξοπρσςτυφχψω";
    auto const* const upper = L"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ";
    auto const to_upper = make_substitutor(lower, upper);
    std::wstring const shouted = tr_string(to_upper, L"Γεια σας");
    EXPECT_EQ(shouted, L"ΓΕΙΑ ΣΑΣ");
}
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
// Test helper: applies the per-character function "tr" to every
// character of "s" and returns the transformed copy.
auto tr_string(auto const& tr, std::wstring s)
{
    std::transform(s.begin(), s.end(), s.begin(), tr);
    return s;
}
// "from" and "to" of different lengths must be rejected with an exception.
TEST(tr, bad_args)
{
    auto const* const five_chars = L"abcde";
    auto const* const four_chars = L"bcde";
    EXPECT_THROW(make_substitutor(five_chars, four_chars), std::invalid_argument);
}
// Empty "from" and "to" sets: every input must come back unchanged.
TEST(tr, noop)
{
    auto const identity = make_substitutor(L"", L"");
    std::wstring const from_empty = tr_string(identity, L"");
    std::wstring const from_hello = tr_string(identity, L"hello");
    EXPECT_EQ(from_empty, L"");
    EXPECT_EQ(from_hello, L"hello");
}
// Plain ASCII substitution: e->i, h->p, l->z, o->a.
TEST(tr, english)
{
    auto const to_pizza = make_substitutor(L"ehlo", L"ipza");
    std::wstring const from_empty = tr_string(to_pizza, L"");
    std::wstring const from_hello = tr_string(to_pizza, L"hello");
    EXPECT_EQ(from_empty, L"");
    EXPECT_EQ(from_hello, L"pizza");
}
// Non-ASCII substitution: lower-case Greek letters (including final
// sigma) are mapped to their upper-case forms.
TEST(tr, greek)
{
    auto const* const lower = L"αβγδεζηθικλμνξοπρσςτυφχψω";
    auto const* const upper = L"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ";
    auto const to_upper = make_substitutor(lower, upper);
    std::wstring const shouted = tr_string(to_upper, L"Γεια σας");
    EXPECT_EQ(shouted, L"ΓΕΙΑ ΣΑΣ");
}
Quick demo
$ ./wchar-tr αβγδεζηθικλμνξοπρσςτυφχψω ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ <<<'Γεια σας'
ΓΕΙΑ ΣΑΣ
Transliterate wide-character input
One shortcoming of GNU's implementation of the tr
utility is that it operates on bytes rather than characters. As the man page says under BUGS:
Full support is available only for safe single-byte locales, in which every possible input byte represents a single character.
I've created a limited replacement that works with wide characters according to the inherited locale.
Known limitations
- None of the options of `tr` are supported, such as `-d` to delete matched characters instead of replacing them, or `-c` to complement the match set. These two would be fairly easy to implement; the `-s` option to squeeze successive matches less so.
- The "from" and "to" arguments must match in length, and there's no support for ranges, character classes or octal notation. Notably, this prevents us matching or substituting null characters.
- Replacement is by wide-character code-point. So there's no normalisation of combining accents, and if your wide-character coding uses multi-char sequences (hello, UTF-16) then it's not usable for those.
These limitations are acceptable for my use-cases, so I don't need to change anything.
The code
I'll start with the core function:
#include <map>
#include <ranges>
#include <stdexcept>
#include <string_view>
#include <utility>
// Builds a character-translation function object: a character found in
// "from" is replaced by the character at the same position in "to";
// any other character is returned unchanged.
// Throws std::invalid_argument when the two strings differ in length.
auto make_substitutor(std::wstring_view from, std::wstring_view to)
{
    if (from.size() != to.size()) {
        throw std::invalid_argument("Replacement length mismatch");
    }

    std::map<wchar_t, wchar_t> table;
    auto replacement = to.begin();
    for (wchar_t const original : from) {
        // emplace() leaves an existing entry alone, so the first
        // occurrence of a repeated "from" character takes precedence.
        table.emplace(original, *replacement++);
    }

    return [table = std::move(table)](wchar_t c) {
        if (auto const found = table.find(c); found != table.end()) {
            return found->second;
        }
        return c;
    };
}
I have unit tests for it:
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
// Test helper: applies the per-character function "tr" to every
// character of "s" and returns the transformed copy.
auto tr_string(auto const& tr, std::wstring s)
{
    std::transform(s.begin(), s.end(), s.begin(), tr);
    return s;
}
// "from" and "to" of different lengths must be rejected with an exception.
TEST(tr, bad_args)
{
    auto const* const five_chars = L"abcde";
    auto const* const four_chars = L"bcde";
    EXPECT_THROW(make_substitutor(five_chars, four_chars), std::invalid_argument);
}
// Rotating substitution over a..e: each listed character maps to the
// next one (and 'e' wraps to 'a'); 'f' is unlisted and passes through.
TEST(tr, abcde)
{
    auto const rotate = make_substitutor(L"abcde", L"bcdea");
    std::wstring const from_empty = tr_string(rotate, L"");
    std::wstring const from_text = tr_string(rotate, L"fedcba");
    EXPECT_EQ(from_empty, L"");
    EXPECT_EQ(from_text, L"faedcb");
}
// Non-ASCII substitution: lower-case Greek letters (including final
// sigma) are mapped to their upper-case forms.
TEST(tr, greek)
{
    auto const* const lower = L"αβγδεζηθικλμνξοπρσςτυφχψω";
    auto const* const upper = L"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ";
    auto const to_upper = make_substitutor(lower, upper);
    std::wstring const shouted = tr_string(to_upper, L"Γεια σας");
    EXPECT_EQ(shouted, L"ΓΕΙΑ ΣΑΣ");
}
And a main()
that makes it a complete program:
#include <algorithm>
#include <iostream>
#include <iterator>
#include <locale>
#include <print>
#include <stdexcept>
#include <string>
#include <clocale>
#include <cstdlib>
#include <cstring>
// Convert a multibyte string in the platform encoding (as selected by
// the current C locale) to a wide string.
// Throws std::invalid_argument if "arg" is not a valid multibyte sequence.
static std::wstring arg_to_wstring(const char* arg)
{
    // Every wide character comes from at least one input byte, so the
    // byte length is a safe upper bound (likely an over-allocation).
    std::wstring s(std::strlen(arg), L'\0');
    auto const len = std::mbstowcs(s.data(), arg, s.size());
    // mbstowcs() signals an invalid sequence by returning (size_t)-1.
    if (len == static_cast<std::size_t>(-1)) {
        throw std::invalid_argument("Malformed multibyte string");
    }
    // Trim the buffer down to the number of characters actually converted.
    s.resize(len);
    return s;
}
// Program entry point: copies standard input to standard output,
// replacing each character found in argv[1] with the character at the
// same position in argv[2].
// Returns EXIT_FAILURE on invalid locale settings, a wrong argument
// count, malformed or mismatched arguments, or an output failure.
int main(int argc, char **argv)
{
    // Adopt the locale inherited from the environment; it governs the
    // multibyte <-> wide-character conversions below.
    if (!std::setlocale(LC_ALL, "")) {
        println(std::cerr, "Invalid locale settings");
        return EXIT_FAILURE;
    }
    if (argc != 3) {
        // argv[0] may be null when argc is 0, hence the fallback name.
        println(std::cerr, "Usage: {} from_chars to_chars", *argv ? *argv : "tr");
        return EXIT_FAILURE;
    }
    try {
        std::wstring const from = arg_to_wstring(argv[1]);
        std::wstring const to = arg_to_wstring(argv[2]);
        std::wcin.exceptions(std::ios::badbit | std::ios::failbit);
        std::wcout.exceptions(std::ios::badbit | std::ios::failbit);
        auto const out =
            std::transform(std::istreambuf_iterator<wchar_t>(std::wcin),
                           std::istreambuf_iterator<wchar_t>(),
                           std::ostreambuf_iterator<wchar_t>(std::wcout),
                           make_substitutor(from, to));
        // The stream-buffer iterators write straight to the buffer and
        // never touch the stream's state flags, so a failed write has to
        // be picked up from the returned iterator.
        if (out.failed()) {
            throw std::ios_base::failure("Write error");
        }
        // Flush while the handler is still active so that a late write
        // failure is reported instead of being lost at exit.
        std::wcout.flush();
    } catch (std::exception const& e) {
        println(std::cerr, "{}", e.what());
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}