In this repo I've put together a header only string splitter, allowing for characters and string literals as delimiters.
The (little) library is strictly C++17.
I would like to ask for your comments.
I would also like the code to be as short as possible, so any comments in that direction are also most welcome.
As it appears to be mandatory to include at least 3 lines of code, here's the code:
// MIT License
//
// Copyright (c) 2019 degski
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <algorithm>
#include <iostream>
#include <string>
#include <string_view>
#include <type_traits>
#include <vector>
std::ostream & nl ( std::ostream & out_ ) { return out_ << '\n'; }
std::wostream & nl ( std::wostream & out_ ) { return out_ << L'\n'; }
namespace sax::detail {
template<typename CharT>
[[ nodiscard ]] constexpr std::basic_string_view<CharT> make_string_view ( std::basic_string_view<CharT> x ) noexcept {
return x; // guaranteed copy elision.
}
template<typename CharT>
[[ nodiscard ]] constexpr std::basic_string_view<CharT> make_string_view ( CharT x ) noexcept {
return std::basic_string_view<CharT> ( std::addressof ( x ), 1 );
}
template<typename CharT>
[[ nodiscard ]] constexpr std::basic_string_view<CharT> make_string_view ( const CharT * x ) noexcept {
return std::basic_string_view<CharT> ( x );
}
template<typename CharT>
constexpr void remove_prefix ( std::basic_string_view<CharT> & s, bool & removed, std::basic_string_view<CharT> x ) noexcept {
// This bit will come with C++20.
if ( s.size ( ) >= x.size ( ) and s.compare ( 0, x.size ( ), x ) == 0 ) {
s.remove_prefix ( x.size ( ) );
removed = removed or true;
};
}
template<typename CharT>
constexpr void remove_prefix ( std::basic_string_view<CharT> & s, bool & removed, CharT x ) noexcept {
if ( s.size ( ) >= 1 and s [ 0 ] == x ) {
s.remove_prefix ( 1 );
removed = removed or true;
};
}
template<typename CharT>
constexpr void remove_prefix ( std::basic_string_view<CharT> & s, bool & removed, const CharT * x ) noexcept {
remove_prefix ( s, removed, std::basic_string_view<CharT> ( x ) );
}
template<typename CharT, typename ... Args>
constexpr void remove_prefix ( std::basic_string_view<CharT> & s_, Args ... args_ ) noexcept {
bool removed = false;
do {
removed = false;
( remove_prefix ( s_, removed, std::forward<Args> ( args_ ) ), ... );
} while ( removed ); // Keep removing untill nothing more can be removed.
}
template<typename CharT>
constexpr void remove_suffix ( std::basic_string_view<CharT> & s, bool & removed, std::basic_string_view<CharT> x ) noexcept {
// This bit will come with C++20.
if ( s.size ( ) >= x.size ( ) and s.compare ( s.size ( ) - x.size ( ), std::basic_string_view<CharT>::npos, x ) == 0 ) {
s.remove_suffix ( x.size ( ) );
removed = removed or true;
};
}
template<typename CharT>
constexpr void remove_suffix ( std::basic_string_view<CharT> & s, bool & removed, CharT x ) noexcept {
remove_suffix ( s, removed, std::basic_string_view<CharT> ( std::addressof ( x ), 1 ) );
}
template<typename CharT>
constexpr void remove_suffix ( std::basic_string_view<CharT> & s, bool & removed, const CharT * x ) noexcept {
remove_suffix ( s, removed, std::basic_string_view<CharT> ( x ) );
}
template<typename CharT, typename ... Args>
constexpr void remove_suffix ( std::basic_string_view<CharT> & s_, Args ... args_ ) noexcept {
bool removed = false;
do {
removed = false;
( remove_suffix ( s_, removed, std::forward<Args> ( args_ ) ), ... );
} while ( removed ); // Keep removing untill nothing more can be removed.
}
template<typename CharT, typename SizeT, typename StringyThing>
constexpr void find ( std::basic_string_view<CharT> & s, SizeT & f_, StringyThing x_ ) noexcept {
f_ = std::min ( s.find ( make_string_view<CharT> ( x_ ) ), f_ );
}
template<typename CharT, typename ... Args>
[[ nodiscard ]] constexpr auto find ( std::basic_string_view<CharT> & s_, Args ... args_ ) noexcept {
auto found = std::basic_string_view<CharT>::npos;
( find ( s_, found, std::forward<Args> ( args_ ) ), ... );
return found;
}
}
namespace sax {
template<typename CharT, typename ... Delimiters>
[[ nodiscard ]] std::vector<std::basic_string_view<CharT>> string_split ( const std::basic_string<CharT> & string_, Delimiters ... delimiters_ ) {
using size_type = typename std::basic_string_view<CharT>::size_type;
std::basic_string_view<CharT> string_view ( string_ );
std::vector<std::basic_string_view<CharT>> string_view_vector;
string_view_vector.reserve ( 4 ); // Avoid small size re-allocating, 0 > 1 > 2 > 3 > 4 > 6, now 4 > 6 > 9 etc.
// Remove trailing delimiters.
detail::remove_suffix ( string_view, std::forward<Delimiters> ( delimiters_ ) ... );
// Parse the string_view left to right.
while ( true ) {
detail::remove_prefix ( string_view, std::forward<Delimiters> ( delimiters_ ) ... );
const size_type pos = detail::find ( string_view, std::forward<Delimiters> ( delimiters_ ) ... );
if ( std::basic_string_view<CharT>::npos == pos ) {
string_view_vector.emplace_back ( std::move ( string_view ) );
break;
}
string_view_vector.emplace_back ( string_view.data ( ), pos );
string_view.remove_prefix ( pos );
}
return string_view_vector;
}
}
Used like so:
// MIT License
//
// Copyright (c) 2019 degski
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <array>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <random>
#include <string>
#include <type_traits>
#include <vector>
namespace fs = std::filesystem;
#include <string_split.hpp>
template<typename Stream, typename Container>
Stream & operator << ( Stream & out_, const Container & s_ ) noexcept {
for ( const auto & v : s_ )
out_ << '\"' << v << "\" ";
out_ << '\b';
return out_;
}
int main ( ) {
std::string s ( " , \t the quick brown ,, ,fox jumps underover \t , the lazy dog ," );
std::cout << sax::string_split ( s, " ", ',', "\t", "under" ) << nl;
return EXIT_SUCCESS;
}
Output:
"the" "quick" "brown" "fox" "jumps" "over" "the" "lazy" "dog"
What the code does (is supposed to do): Remove from the string any delimiters passed in, doing that left to right, applying the delimiters left to right and return the now separate bits left as a vector of string_view's over the original string. The latter means the string has to outlive the vector of string views.
The above does means that depending on what kind of delimiters you put (as they can be strings, which can interact with each other), that the order of the delimiters has significance.
The nl-code was originally in another header part of a bigger little lib, but due to the requirement all should be self-contained is just thrown in there.
removed = removed or true
can/should be removed = true
.
1 Answer 1
I don't always follow this advice myself, but it's a good idea to write
a specification for the function/algorithm you're implementing. (See
this
post by Eric Lippert for more information). That'll also make it easier
to come up with test cases and reason about various corner cases (like
what's the behavior if I try do split("aaba", "a", "ab")
for
instance?).
In this case we might go for something like: Given a string_view
and a
list (argument list) of "string like" delimiters return a std::vector
of string_views representing the text between delimiters (using the
first matching delimiter). If no delimiters are found return a vector
with one element consisting of the input string.
I mention this because at the moment it's a bit hard to see the overall idea behind your code. It's not totally clear to me what it's doing, I can see it's removing suffixes and prefixes, but it's hard to verify that the main loop always terminates for instance. I think if you'd written an informal specification beforehand you'd probably have ended up with a more structured and easier to understand main loop.
Here's a list of some other things I noticed while reading the code (some are from my comment):
make_string_view( CharT X )
returns a pointer to a temporary. You can make astring_view
that way from the arguments, but you need pass the argument by reference all the way.removed = removed or true
should just beremoved = true
nl
doesn't belong in that header- I'd put all internal functions into some kind of
private
/internal
/detail
namespace - I personally only use
[[nodiscard]]
on functions where it's a big mistake to not use the return value (I don't thinkmake_string_view
qualifies) - You're missing
&&
on the delimiters forstd::forward
to do its magic - You probably only want to create
string_view
s from the delimiter arguments once
Given all of the above I image you could implement the function something like this (in pseudo-code):
vector<string_view<CharT>> split(string_view<CharT> str, Delimiters&& delims...) {
if (str.empty()) return { str }; // Get special case out of the way
vector<string_view<CharT>> res;
size_t last_index = 0;
const auto delim_array = make_delim_array(std::forward<Delimiters>(delims)...);
for (size_t i = 0; i < str.length();) {
if (auto match_length = any_matches(str, i, delim_array)) {
res.push_back(str.substr(last_index, i - last_index));
i += match_length;
last_index = i;
} else {
++i;
}
}
res.push_back(str.substr(last_index));
return res;
}
-
\$\begingroup\$
make_string_view( CharT X )
, yes I had/have trouble with that, the other functions (remove_prefix and remove_suffix) had it as well, but didn't work [because of that what you say]. I guess in the one use case that's left, RVO makes it work, it [the function] has to go. The support functions are insax::detail
. I'll re-gig the code some (taking cues from the pseudo-code) and add the empty check, Thanks! (Some of your other concerns have been addressed in an addendum to the question. \$\endgroup\$degski– degski2019年03月02日 04:16:07 +00:00Commented Mar 2, 2019 at 4:16 -
\$\begingroup\$
You're missing && on the delimiters for std::forward to do its magic
, I don't get (as in understand) the idea of moving from the delimiters and at the same time maintaining astd::string_view
on them. Could you please help me understand what's going on? \$\endgroup\$degski– degski2019年03月02日 04:51:53 +00:00Commented Mar 2, 2019 at 4:51 -
\$\begingroup\$ I meant that "Delimiters" in the argument list needs to be "Delimiters&&" (like in my pseudo-code) for perfect forwarding to work. BUT having though a little more about it, I think here you should just take the arguments by const reference (since you won't be "consuming" any of delimiter arguments). \$\endgroup\$user786653– user7866532019年03月02日 09:08:27 +00:00Commented Mar 2, 2019 at 9:08
-
\$\begingroup\$ ... by const reference (since you won't be "consuming" any of delimiter arguments), exactly, that was what I was alluding too (but not sure) in my comment, you don't want to forward them (otherwise) certainly the char won't work. \$\endgroup\$degski– degski2019年03月02日 10:54:28 +00:00Commented Mar 2, 2019 at 10:54
-
\$\begingroup\$ Why special-case an empty
str
? \$\endgroup\$Deduplicator– Deduplicator2019年03月02日 17:06:40 +00:00Commented Mar 2, 2019 at 17:06
make_string_view ( CharT x )
returning a pointer to a temporary (the argument)?removed = removed or true
should just beremoved = true
,nl
doesn't belong in the header as far as I can tell, and I'm pretty sure it can be made smaller, but it's a bit hard to comment more specifically without a set of test-cases and/or description of what's expected to be returned in various corner cases. \$\endgroup\$StrSplit()
)? \$\endgroup\$