Skip to main content
Code Review

Return to Question

Became Hot Network Question
Include a demo; more entertaining test
Source Link
Toby Speight
  • 87.7k
  • 14
  • 104
  • 325
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
// Test helper: returns a copy of "s" in which every character has been
// replaced by the result of calling "tr" on it.
auto tr_string(auto const& tr, std::wstring s)
{
    for (auto& ch : s) {
        ch = tr(ch);
    }
    return s;
}
// A "from" set that is longer than the "to" set must be rejected.
TEST(tr, bad_args)
{
    wchar_t const* const from = L"abcde";  // five characters
    wchar_t const* const to = L"bcde";     // only four
    EXPECT_THROW(make_substitutor(from, to), std::invalid_argument);
}
// Rotates a..e by one position: characters outside the set pass through
// unchanged, while every member of the set is replaced.
TEST(tr, abcdenoop)
{
    auto const tr = make_substitutor(L"abcde", L"bcdea");
    EXPECT_EQ(tr_string(tr, L""), L"");
    // 'e' belongs to the "from" set, so the trailing "hello" becomes
    // "hallo"; expecting it to survive unchanged could never pass.
    EXPECT_EQ(tr_string(tr, L"fedcbahello"), L"faedcbhallo");
}
// Substituting e->i, h->p, l->z, o->a turns "hello" into "pizza".
TEST(tr, english)
{
    auto const pizzify = make_substitutor(L"ehlo", L"ipza");
    std::wstring const empty;
    EXPECT_EQ(tr_string(pizzify, empty), empty);
    std::wstring const expected = L"pizza";
    EXPECT_EQ(tr_string(pizzify, L"hello"), expected);
}
// Upper-cases Greek text; both lower-case sigma forms map to the same
// capital, and characters outside the set (the space, the already-capital
// first letter) are left alone.
TEST(tr, greek)
{
    std::wstring const lower = L"αβγδεζηθικλμνξοπρσςτυφχψω";
    std::wstring const upper = L"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ";
    auto const to_upper = make_substitutor(lower, upper);
    EXPECT_EQ(tr_string(to_upper, L"Γεια σας"), L"ΓΕΙΑ ΣΑΣ");
}

Quick demo

$ ./wchar-tr αβγδεζηθικλμνξοπρσςτυφχψω ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ <<<'Γεια σας'
ΓΕΙΑ ΣΑΣ
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
// Test helper: returns a copy of "s" in which every character has been
// replaced by the result of calling "tr" on it.
auto tr_string(auto const& tr, std::wstring s)
{
    for (auto& ch : s) {
        ch = tr(ch);
    }
    return s;
}
// A "from" set that is longer than the "to" set must be rejected.
TEST(tr, bad_args)
{
    wchar_t const* const from = L"abcde";  // five characters
    wchar_t const* const to = L"bcde";     // only four
    EXPECT_THROW(make_substitutor(from, to), std::invalid_argument);
}
// Rotates a..e by one position; 'f' is outside the set and passes through.
TEST(tr, abcde)
{
    auto const rotate = make_substitutor(L"abcde", L"bcdea");
    std::wstring const empty;
    EXPECT_EQ(tr_string(rotate, empty), empty);
    std::wstring const expected = L"faedcb";
    EXPECT_EQ(tr_string(rotate, L"fedcba"), expected);
}
// Upper-cases Greek text; both lower-case sigma forms map to the same
// capital, and characters outside the set (the space, the already-capital
// first letter) are left alone.
TEST(tr, greek)
{
    std::wstring const lower = L"αβγδεζηθικλμνξοπρσςτυφχψω";
    std::wstring const upper = L"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ";
    auto const to_upper = make_substitutor(lower, upper);
    EXPECT_EQ(tr_string(to_upper, L"Γεια σας"), L"ΓΕΙΑ ΣΑΣ");
}
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
// Test helper: returns a copy of "s" in which every character has been
// replaced by the result of calling "tr" on it.
auto tr_string(auto const& tr, std::wstring s)
{
    for (auto& ch : s) {
        ch = tr(ch);
    }
    return s;
}
// A "from" set that is longer than the "to" set must be rejected.
TEST(tr, bad_args)
{
    wchar_t const* const from = L"abcde";  // five characters
    wchar_t const* const to = L"bcde";     // only four
    EXPECT_THROW(make_substitutor(from, to), std::invalid_argument);
}
// With empty "from" and "to" sets, every input passes through unchanged.
TEST(tr, noop)
{
    auto const identity = make_substitutor(L"", L"");
    std::wstring const empty;
    std::wstring const word = L"hello";
    EXPECT_EQ(tr_string(identity, empty), empty);
    EXPECT_EQ(tr_string(identity, word), word);
}
// Substituting e->i, h->p, l->z, o->a turns "hello" into "pizza".
TEST(tr, english)
{
    auto const pizzify = make_substitutor(L"ehlo", L"ipza");
    std::wstring const empty;
    EXPECT_EQ(tr_string(pizzify, empty), empty);
    std::wstring const expected = L"pizza";
    EXPECT_EQ(tr_string(pizzify, L"hello"), expected);
}
// Upper-cases Greek text; both lower-case sigma forms map to the same
// capital, and characters outside the set (the space, the already-capital
// first letter) are left alone.
TEST(tr, greek)
{
    std::wstring const lower = L"αβγδεζηθικλμνξοπρσςτυφχψω";
    std::wstring const upper = L"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ";
    auto const to_upper = make_substitutor(lower, upper);
    EXPECT_EQ(tr_string(to_upper, L"Γεια σας"), L"ΓΕΙΑ ΣΑΣ");
}

Quick demo

$ ./wchar-tr αβγδεζηθικλμνξοπρσςτυφχψω ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ <<<'Γεια σας'
ΓΕΙΑ ΣΑΣ
Source Link
Toby Speight
  • 87.7k
  • 14
  • 104
  • 325

Transliterate wide-character input

One shortcoming of GNU's implementation of the tr utility is that it operates on bytes rather than characters. As the man page says under BUGS:

Full support is available only for safe single-byte locales, in which every possible input byte represents a single character.

I've created a limited replacement that works with wide characters according to the inherited locale.

Known limitations

  • None of the options of tr are supported, such as -d to delete matched characters instead of replacing them, or -c to complement the match set. These two would be fairly easy to implement; the -s option to squeeze successive matches less so.
  • The "from" and "to" arguments must match in length, and there's no support for ranges, character classes or octal notation. Notably, this prevents us matching or substituting null characters.
  • Replacement is by wide-character code-point. So there's no normalisation of combining accents, and if your platform's wide-character encoding represents some characters as multi-unit sequences (hello, UTF-16 surrogate pairs) then those characters can't be matched or substituted.

These limitations are acceptable for my use-cases, so I don't need to change anything.


The code

I'll start with the core function:

#include <map>
#include <ranges>
#include <stdexcept>
#include <string_view>
#include <utility>
// Returns a function object that converts any character present in
// "from" to the character at the same position in "to".  Characters
// not present in "from" are returned unchanged.
//
// If a character occurs more than once in "from", its last occurrence
// determines the replacement (the same choice GNU tr makes).
//
// Throws std::invalid_argument if "from" and "to" differ in length.
auto make_substitutor(std::wstring_view from, std::wstring_view to)
{
    if (from.size() != to.size()) {
        throw std::invalid_argument("Replacement length mismatch");
    }
    // Fill the table one pair at a time instead of via a range constructor,
    // so that the treatment of repeated keys is well-defined.
    std::map<wchar_t, wchar_t> map;
    auto replacement = to.begin();
    for (wchar_t const source : from) {
        map.insert_or_assign(source, *replacement++);
    }
    // The closure owns the table, so it stays valid after "from" and "to"
    // have gone away.
    return [map = std::move(map)](wchar_t c) {
        auto const it = map.find(c);
        return it == map.end() ? c : it->second;
    };
}

I have unit tests for it:

#include <gtest/gtest.h>
#include <algorithm>
#include <string>
// Test helper: returns a copy of "s" in which every character has been
// replaced by the result of calling "tr" on it.
auto tr_string(auto const& tr, std::wstring s)
{
    for (auto& ch : s) {
        ch = tr(ch);
    }
    return s;
}
// A "from" set that is longer than the "to" set must be rejected.
TEST(tr, bad_args)
{
    wchar_t const* const from = L"abcde";  // five characters
    wchar_t const* const to = L"bcde";     // only four
    EXPECT_THROW(make_substitutor(from, to), std::invalid_argument);
}
// Rotates a..e by one position; 'f' is outside the set and passes through.
TEST(tr, abcde)
{
    auto const rotate = make_substitutor(L"abcde", L"bcdea");
    std::wstring const empty;
    EXPECT_EQ(tr_string(rotate, empty), empty);
    std::wstring const expected = L"faedcb";
    EXPECT_EQ(tr_string(rotate, L"fedcba"), expected);
}
// Upper-cases Greek text; both lower-case sigma forms map to the same
// capital, and characters outside the set (the space, the already-capital
// first letter) are left alone.
TEST(tr, greek)
{
    std::wstring const lower = L"αβγδεζηθικλμνξοπρσςτυφχψω";
    std::wstring const upper = L"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩ";
    auto const to_upper = make_substitutor(lower, upper);
    EXPECT_EQ(tr_string(to_upper, L"Γεια σας"), L"ΓΕΙΑ ΣΑΣ");
}

And a main() that makes it a complete program:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <locale>
#include <cstdlib>
#include <cstring>
#include <print>
// Convert from platform (multibyte) string to wide string, using
// std::mbstowcs and therefore the current C locale's encoding.
// Throws std::invalid_argument if "arg" is not a valid multibyte string.
static std::wstring arg_to_wstring(const char* arg)
{
    // Each wide character consumes at least one byte of input, so the byte
    // length is a safe upper bound on the result (likely over-allocation).
    std::wstring s(std::strlen(arg), L'\0');
    auto const len = std::mbstowcs(s.data(), arg, s.size());
    if (len == static_cast<std::size_t>(-1)) {  // mbstowcs' error return
        throw std::invalid_argument("Malformed multibyte string");
    }
    s.resize(len);  // drop the unused tail of the buffer
    return s;
}
// Program entry point: copies standard input to standard output, replacing
// each character found in argv[1] with the character at the same position
// in argv[2].  Returns EXIT_FAILURE (after a message on stderr) for a bad
// locale, a wrong argument count, or any exception raised while working.
int main(int argc, char **argv)
{
    // Adopt the inherited locale so that argument conversion uses the
    // user's character encoding.
    if (!std::setlocale(LC_ALL, "")) {
        println(std::cerr, "Invalid locale settings");
        return EXIT_FAILURE;
    }
    if (argc != 3) {
        // argv[0] may be null, so fall back to a fixed program name.
        println(std::cerr, "Usage: {} from_chars to_chars", *argv ? *argv : "tr");
        return EXIT_FAILURE;
    }
    try {
        std::wstring const from = arg_to_wstring(argv[1]);
        std::wstring const to = arg_to_wstring(argv[2]);
        std::wcin.exceptions(std::ios::badbit | std::ios::failbit);
        std::wcout.exceptions(std::ios::badbit | std::ios::failbit);
        auto const out =
            std::transform(std::istreambuf_iterator<wchar_t>(std::wcin),
                           std::istreambuf_iterator<wchar_t>(),
                           std::ostreambuf_iterator<wchar_t>(std::wcout),
                           make_substitutor(from, to));
        // ostreambuf_iterator writes straight to the stream buffer, so a
        // failed write never reaches wcout's state (or its exception mask);
        // the iterator itself has to be asked.
        if (out.failed()) {
            throw std::ios_base::failure("Write error");
        }
        // Flush here so that an error which only shows up when the buffer
        // is written out is reported rather than lost at exit.
        std::wcout.flush();
    } catch (std::exception const& e) {
        println(std::cerr, "{}", e.what());
        return EXIT_FAILURE;
    }
}
lang-cpp

AltStyle によって変換されたページ (->オリジナル) /