A follow-up for this previous question.
I took into account previous reviews, and tried to make a simple API. I had never done anything non-trivial with C++20 concepts and ranges until now, so I am sure I have a lot of things to fix. I added many helper concepts. Interestingly, while there are various iterator and range concepts in the standard library, there is no way to specify "iterator for int" or "range of int". So, many of my concepts try to implement that.
The general design is that you can view a UTF-8 range as a range of code-points using the from_utf8_range
class. And, you can write code points to a UTF-16 range using the to_utf16_iter
class. To convert between ranges, you can use std::ranges::copy
from standard library. to_utf16
is a convenience function to do that.
Code
#include <stddef.h>
#include <iterator>
#include <ranges>
#include <stdexcept>
#include <type_traits>
namespace unic {
/// Some helper concept-combinations
/// ..._for<T1, T2> -> T1 contains the T2
template <class Iter, class Type>
concept iterator_for =
::std::same_as<::std::decay_t<::std::iter_value_t<Iter>>, Type>;
template <class Iter, class Type>
concept forward_iterator_for =
::std::forward_iterator<Iter> && iterator_for<Iter, Type>;
template <class Iter, class Type>
concept input_iterator_for =
::std::input_iterator<Iter> && iterator_for<Iter, Type>;
template <class Range, class Type>
concept range_for = ::std::ranges::range<Range> &&
::std::same_as<::std::decay_t<::std::ranges::range_value_t<Range>>, Type>;
template <class Range, class Type>
concept sized_range_for =
::std::ranges::sized_range<Range> && range_for<Range, Type>;
template <class SizedInputRange, class Type>
concept sized_input_range_for = sized_range_for<SizedInputRange, Type> &&
::std::ranges::input_range<SizedInputRange>;
template <class SizedInputRange, class Type>
concept sized_forward_range_for = sized_range_for<SizedInputRange, Type> &&
::std::ranges::forward_range<SizedInputRange>;
template <class InputRange, class Type>
concept input_range_for =
::std::ranges::input_range<InputRange> && range_for<InputRange, Type>;
// Exception classes
struct utf_error : ::std::runtime_error {
utf_error(::std::string const& msg) : ::std::runtime_error(msg) {}
};
// This exception contains the point where error happened
template <class src_iter>
struct utf_positioned_error : utf_error {
[[no_unique_address]] src_iter error_position{};
utf_positioned_error(src_iter err_pos, ::std::string const& msg)
: error_position(::std::move(err_pos)), utf_error(::std::move(msg)) {}
};
// utf8 to code points
template <forward_iterator_for<char8_t> src_iter,
::std::sized_sentinel_for<src_iter> src_end_iter>
class from_utf8_range final {
private:
[[no_unique_address]] src_iter m_begin{};
[[no_unique_address]] src_end_iter m_end{};
public:
struct iterator final // forward_iterator
{
private:
friend class from_utf8_range;
[[no_unique_address]] src_iter m_begin{};
[[no_unique_address]] src_end_iter m_end{};
constexpr iterator(src_iter begin, src_end_iter end) noexcept
: m_begin(::std::move(begin)), m_end(::std::move(end)) {}
// returns -1 on fail
[[nodiscard]] static constexpr int compute_byte_count(
char8_t const header) noexcept {
auto const cnt = ::std::countl_one(static_cast<unsigned char>(header));
if (cnt == 0)
return 1;
if (cnt < 2 || cnt > 6)
return -1;
return cnt;
}
public:
iterator() = default; // required for the iterator concept apparently
using iterator_category = ::std::forward_iterator_tag;
using difference_type = ::std::ptrdiff_t;
using value_type = char32_t;
[[maybe_unused]] constexpr auto operator++() -> iterator& {
auto const cnt = compute_byte_count(*m_begin);
if (cnt == -1 || cnt > m_end - m_begin)
throw utf_positioned_error(m_begin, "Length in header byte is wrong");
::std::advance(m_begin, cnt);
return *this;
}
[[nodiscard]] constexpr auto operator++(int) -> iterator {
auto copy = *this;
++*this;
return copy;
}
[[nodiscard]] constexpr auto operator*() const -> char32_t {
auto const cnt = compute_byte_count(*m_begin);
if (cnt == -1 || cnt > m_end - m_begin)
throw utf_positioned_error(m_begin, "Length in header byte is wrong");
char32_t code_point = 0;
if (cnt == 1) // ascii
code_point = *m_begin;
else {
// extract trailing bits
auto begin = m_begin;
code_point = static_cast<char32_t>(
*begin++ & (static_cast<char8_t>(~0u) >> (cnt + 1)));
// extract rest of the bytes
for (int i = 1; i < cnt; ++i) {
if (*begin < 0x80 || 0xBF < *begin)
throw utf_positioned_error(begin, "Illegal trail byte");
code_point = (code_point << 6) | (*begin & 0x3F);
++begin;
}
}
return code_point;
}
[[nodiscard]] constexpr auto operator==(
iterator const& other) const noexcept -> bool {
return m_begin == other.m_begin;
}
[[nodiscard]] constexpr auto operator!=(
iterator const& other) const noexcept -> bool {
return !(*this == other);
}
};
public:
constexpr from_utf8_range(src_iter begin, src_end_iter end) noexcept
: m_begin(::std::move(begin)), m_end(::std::move(end)) {}
template <sized_range_for<char8_t> u8range>
constexpr from_utf8_range(u8range const& range) noexcept
: from_utf8_range(::std::begin(range), ::std::end(range)) {}
// all iterators are const
[[nodiscard]] constexpr auto begin() const noexcept {
return iterator(m_begin, m_end);
}
[[nodiscard]] constexpr auto end() const noexcept {
return iterator{m_end, m_end};
}
[[nodiscard]] constexpr auto cbegin() const noexcept { return begin(); }
[[nodiscard]] constexpr auto cend() const noexcept { return end(); }
};
template <sized_range_for<char8_t> u8range>
from_utf8_range(u8range const& range) noexcept
->from_utf8_range<::std::decay_t<decltype(::std::ranges::begin(range))>,
::std::decay_t<decltype(::std::ranges::end(range))>>;
namespace detail {
// Outputs a code-point to a UTF-16 output iterator
template <::std::output_iterator<char16_t> out_iter>
[[maybe_unused]] constexpr int append(out_iter out, char32_t code_point) {
if (code_point <= 0xFFFF) {
*out++ = static_cast<char16_t>(code_point);
return 1;
} else if (code_point <= 0x10FFFF) {
code_point -= 0x10000;
*out++ = static_cast<char16_t>((code_point >> 10) + 0xD800);
*out++ = static_cast<char16_t>((code_point & 0x3FF) + 0xDC00);
return 2;
} else {
throw utf_positioned_error(out, "Out of UTF-16 range");
}
}
} // namespace detail
// Code points to utf16
template <::std::output_iterator<char16_t> out_iter>
class to_utf16_iter final {
private:
[[no_unique_address]] out_iter m_iter{};
struct proxy_assigner {
private:
friend class to_utf16_iter;
[[no_unique_address]] to_utf16_iter* m_parent;
proxy_assigner(to_utf16_iter* parent) noexcept : m_parent{parent} {}
proxy_assigner(::std::nullptr_t) = delete;
public:
[[maybe_unused]] constexpr auto operator=(char32_t const code_point) const
-> proxy_assigner const& {
auto const unit_cnt = detail::append(m_parent->m_iter, code_point);
::std::advance(m_parent->m_iter, unit_cnt);
return *this;
}
template <class T>
proxy_assigner& operator=(T) const = delete;
};
public:
to_utf16_iter(out_iter iter) noexcept : m_iter(::std::move(iter)) {}
to_utf16_iter() = default;
using iterator_category = ::std::output_iterator_tag;
using difference_type = ptrdiff_t;
// Incrementing is no-op for output iterators. Actual incrementing is done on
// write
[[maybe_unused]] constexpr auto operator++() noexcept -> to_utf16_iter& {
return *this;
}
[[nodiscard]] constexpr auto operator++(int) noexcept -> to_utf16_iter {
return *this;
}
[[nodiscard]] constexpr auto operator*() noexcept -> proxy_assigner {
return {this};
}
};
template <input_iterator_for<char32_t> input_beg,
::std::sentinel_for<input_beg> input_end>
constexpr ptrdiff_t to_utf16_size(input_beg beg, input_end const end) {
ptrdiff_t size = 0;
for (; beg != end; ++beg) {
auto const code_point = *beg;
if (code_point <= 0xFFFF)
size += 1;
else if (code_point <= 0x10FFFF)
size += 2;
else
throw utf_positioned_error(beg, "Out of UTF-16 range");
}
return size;
}
template <input_range_for<char32_t> u32range>
constexpr ptrdiff_t to_utf16_size(u32range const& range) {
return to_utf16_size(::std::ranges::begin(range), ::std::ranges::end(range));
}
template <forward_iterator_for<char8_t> input_beg,
::std::sized_sentinel_for<input_beg> input_end>
constexpr ptrdiff_t to_utf16_size(input_beg beg, input_end const end) {
return to_utf16_size(from_utf8_range{beg, end});
}
template <sized_forward_range_for<char8_t> u8range>
constexpr ptrdiff_t to_utf16_size(u8range const& range) {
return to_utf16_size(from_utf8_range{range});
}
template <forward_iterator_for<char8_t> u8beg,
::std::sized_sentinel_for<u8beg> u8end,
::std::output_iterator<char32_t> code_point_out>
constexpr void to_utf32(u8beg beg, u8end end, code_point_out out) {
::std::ranges::copy(from_utf8_range{beg, end}, out);
}
template <forward_iterator_for<char8_t> u8beg,
::std::sized_sentinel_for<u8beg> u8end,
::std::output_iterator<char16_t> code_point_out>
constexpr void to_utf16(u8beg beg, u8end end, code_point_out out) {
to_utf32(beg, end, to_utf16_iter{out});
}
template <sized_input_range_for<char8_t> u8range,
::std::output_iterator<char32_t> code_point_out_iter>
constexpr void to_utf32(u8range const& range, code_point_out_iter out) {
from_utf8_range rng{range};
to_utf32(::std::ranges::begin(rng), ::std::ranges::end(rng), out);
}
template <sized_input_range_for<char8_t> u8range,
::std::output_iterator<char16_t> code_point_out>
constexpr void to_utf16(u8range const& range, code_point_out out) {
to_utf16(::std::ranges::begin(range), ::std::ranges::end(range), out);
}
} // namespace unic
Basic usage
int main() {
char8_t constexpr u8[] =
u8"\U0001F449 \U00002728\u7EDD\u4E0D\u4F1A\u653E\u5F03\u4F60\U00002728 "
u8"\U0001F448";
try {
auto constexpr u16_size = unic::to_utf16_size(u8);
char16_t output[u16_size]; // constexpr size
unic::to_utf16(u8, output);
} catch (::std::exception const& ex) {
puts(ex.what());
return -1;
}
}
2 Answers 2
Looks like you really embraced it! Good show! I just have a few simple notes:
Being C++20, you don't need to define operator!=
as well if operator==
is defined.
::std::begin(range), ::std::end(range)
Remember the "std 2-step". Just use std::range::begin
etc. to use the new ones that don't have these issues. I see you used those elsewhere, so you might have just missed some occurrences.
// all iterators are const
No, you call operator++
on the iterator, can assign to one, etc.
The various concepts you define ought to go into a nested detail
namespace, as they pollute your official API. Or maybe you want those in your own different namespace for general reuse separate from this project.
-
\$\begingroup\$ I meant to say that they are all const-iterators, as in you can't modify the element they "point" to. \$\endgroup\$Aykhan Hagverdili– Aykhan Hagverdili2021年06月07日 15:06:41 +00:00Commented Jun 7, 2021 at 15:06
-
1\$\begingroup\$ You could call the nested class
const_iterator
, matching the names as used in collection class templates. \$\endgroup\$JDługosz– JDługosz2021年06月07日 15:22:56 +00:00Commented Jun 7, 2021 at 15:22
It's better to include <cstddef>
than the deprecated <stddef.h>
in C++ code (that ensures the definitions are in the std
namespace). Ironically, the code proceeds to use std::nullptr_t
despite the fact that the C header makes no guarantee of also defining such identifiers. The use of ptrdiff_t
will need updating, though.
I think it's overkill to write ::std::
rather than just std::
, where we know we won't define an inner namespace with that name.
auto const cnt
We're allowed to use vowels in our identifiers! And doing so avoids ambiguity...
-
\$\begingroup\$ Thank you for the review! I recently saw a new proposal to un-depricate
.h
headers. I should have gone with<cstddef>
anyway. \$\endgroup\$Aykhan Hagverdili– Aykhan Hagverdili2021年06月05日 20:51:29 +00:00Commented Jun 5, 2021 at 20:51 -
\$\begingroup\$ I have been using
cnt
to mean count for as long as I remember. I should probably start getting used tocount
for clarity's sake. \$\endgroup\$Aykhan Hagverdili– Aykhan Hagverdili2021年06月05日 20:52:34 +00:00Commented Jun 5, 2021 at 20:52 -
\$\begingroup\$ The abbreviation jarred, particularly as it was out of step with the rest of the code. Sorry I didn't have time for a more in-depth review! \$\endgroup\$Toby Speight– Toby Speight2021年06月06日 06:59:41 +00:00Commented Jun 6, 2021 at 6:59
-
\$\begingroup\$ Yea, the great vowel shortage ended many years ago. \$\endgroup\$JDługosz– JDługosz2021年06月07日 13:36:45 +00:00Commented Jun 7, 2021 at 13:36
-
1\$\begingroup\$ Well, some of the
cnt
you could have meant might be better off asr
forresult
(if I declare a local for the result, that's my preference) orn
(if the scope is small enough and there is no similar variable resp. no equally good use forn
). Conciseness is good as long as it supports or at least doesn't hinder readability. \$\endgroup\$Deduplicator– Deduplicator2021年06月07日 14:49:50 +00:00Commented Jun 7, 2021 at 14:49
int
, but will be collections ofT
(another template argument) or some concept describing the requirements of that type; or to specify that iterators are to the same type that is not itself present as an argument. \$\endgroup\$std::span
for a continuous range of a specific type. It would nice if we could have a similar concept. You're probably right that there isn't enough "demand" though. Defining those for your project is simple enough that I don't think we should complicate the standard library by adding more "stuff" either. \$\endgroup\$begin
andend
are iterators to the same thing (or iterator and sentinel) and apply any concept to thevalue_type
? There's certainly some common constraints that apply to many algorithms, so are they effectively packaged? \$\endgroup\$sentinel_for
everywhere. That's also where I borrowed the naming convention. \$\endgroup\$indirectly_copyable
. But none provide the hard constraint I want: "A range of char8_t elements". I don't want people to pass vaguely related ranges. Like, someone could pass in a range ofint
s sinceint
is convertible tochar8_t
. \$\endgroup\$