I am writing an mathematical expression parser, and therefore I need fast way to edit any string in any imaginable way.
My current solution: (compilable code)
#include <iostream>
#include <string>
#include <algorithm>
#include <list>
#include <vector>
using namespace std;
class is_whitespace {
public:
inline bool operator() (const char & c)
{
return ( ' ' == c) || ('\n' == c) ||
('\r' == c) || ('\t' == c) ||
('\b' == c) || ('\v' == c) ||
('\f' == c);
}
};
inline void erase_whitespaces(string & expr) {
expr.erase(remove_if(expr.begin(), expr.end(), is_whitespace()), expr.end());
}
class FindDelim {
private:
string & out_delim;
public:
inline FindDelim(string & out) : out_delim(out) {}
public:
inline bool operator()(char & c, string & delim) {
if (string(&c, delim.size()) == delim) {
out_delim = delim;
return true;
}
return false;
}
};
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::iterator & str_beg,
const string::iterator & str_end,
const vector<string>::iterator & del_beg,
const vector<string>::iterator & del_end)
{
string delim;
size_t delim_len;
string::iterator last = str_beg;
string::iterator next = str_beg;
while ((next = find_first_of(last, str_end, del_beg, del_end, FindDelim(delim))) != str_end) {
delim_len = delim.size();
output_str.push_back(string(&(*last), distance(last, next)));
output_del.push_back(string(&(*next), delim_len));
last = next + delim_len;
}
output_str.push_back(string(&(*last)));
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::iterator & beg,
const string::iterator & end,
vector<string> & delims)
{
return split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::iterator & beg,
const string::iterator & end,
vector<string> && delims)
{
return split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
string & expr,
vector<string> & delims)
{
return split_string(output_str, output_del, expr.begin(), expr.end(), delims.begin(), delims.end());
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
string & expr,
vector<string> && delims)
{
return split_string(output_str, output_del, expr.begin(), expr.end(), delims.begin(), delims.end());
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
string && expr,
vector<string> & delims)
{
return split_string(output_str, output_del, expr.begin(), expr.end(), delims.begin(), delims.end());
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
string && expr,
vector<string> && delims)
{
return split_string(output_str, output_del, expr.begin(), expr.end(), delims.begin(), delims.end());
}
int main()
{
string expr("[X] += 2 +たす 100 +たす 32 +たす 231 -ひく=わ 123 +たす 532");
// Erase white space:
erase_whitespaces(expr);
cout << expr << endl;
// Split expresion:
vector<string> splited_str;
vector<string> splited_del;
split_string(splited_str, splited_del, expr, { "+=", "-=" });
// Print result:
for (vector<string>::iterator it = splited_str.begin(); it != splited_str.end(); ++it) {
cout << *it << endl;
}
// Hold:
getchar();
}
- How can I improve this code in terms of performance?
- using
string(char *, size_t)
constructor to convertstring::iterator itr
(*itr
is achar
) hurts my eyes and soul... Is there any better way of performing this conversion? - I will want to switch output containers from
vector<string>
tolist<string>
(REASON: callinglist<string> myList; myList.insert()
will be faster.). - I know that
FindDelim()
function object will perform out-of-bounds read, at the end of the string. But I think I can live with it. (prove me wrong?)
Function split_string()
in the example has been called in a loop until the given time duration has passed. Measurement result is average value from every call.
measurement duration: 180 [sec]
measurement result 1: 1895.690 [ns]
measurement result 2: 1878.571 [ns]
-
2\$\begingroup\$ Use flex/bison to write a real expression parser. \$\endgroup\$Loki Astari– Loki Astari2017年05月01日 16:54:32 +00:00Commented May 1, 2017 at 16:54
-
1\$\begingroup\$ @LokiAstari Where's the challenge in that? \$\endgroup\$kyrill– kyrill2017年05月01日 17:14:05 +00:00Commented May 1, 2017 at 17:14
-
3\$\begingroup\$ If you are going to be repeatedly editing this presumably long string then you would want to use a data structure called a rope. \$\endgroup\$Emily L.– Emily L.2017年05月01日 17:31:17 +00:00Commented May 1, 2017 at 17:31
-
\$\begingroup\$ I disagree with @LokiAstari; if you want to write a correct expression parser, I don't think flex/bison is up to the task. Standard infix expression notation deals with contextually-sensitive operator precedence, operators with differing associativity, and a whole mess of other problems. Source: I wrote one \$\endgroup\$Dave DeLong– Dave DeLong2017年05月01日 17:58:25 +00:00Commented May 1, 2017 at 17:58
-
2\$\begingroup\$ @DaveDeLong flex/bison deal with these issues and a whole bunch more very easily and in a fraction of the space you have used for your linked parser. A lesson in using the correct tools for the job. \$\endgroup\$Loki Astari– Loki Astari2017年05月01日 18:05:39 +00:00Commented May 1, 2017 at 18:05
1 Answer 1
You probably don't need to modify the input string. Assuming that whitespace is allowed only between tokens (i.e. that user's aren't expected to write 532
as 5 32
or +=
as + =
), then each token can be returned as a std::string_view
- provided you're willing to keep the string alive for as long as you use any of the views.
Prefer to use const_iterator
types when you won't be modifying the values; I think that all the iterators in split_string()
can be const iterators if you change the signature of FindDelim::operator()
to accept its arguments as reference to const:
bool FindDelim::operator()(const char & c, const string & delim)
{
if (string(&c, delim.size()) == delim) {
out_delim = delim;
return true;
}
return false;
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::const_iterator & str_beg,
const string::const_iterator & str_end,
const vector<string>::const_iterator & del_beg,
const vector<string>::const_iterator & del_end)
{
string delim;
size_t delim_len;
string::const_iterator last = str_beg;
string::const_iterator next = str_beg;
while ((next = find_first_of(last, str_end, del_beg, del_end, FindDelim(delim))) != str_end) {
delim_len = delim.size();
output_str.push_back(string(&(*last), distance(last, next)));
output_del.push_back(string(&(*next), delim_len));
last = next + delim_len;
}
output_str.push_back(string(&(*last)));
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::const_iterator & beg,
const string::const_iterator & end,
vector<string> & delims)
{
split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::const_iterator & beg,
const string::const_iterator & end,
vector<string> && delims)
{
split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}
Consider providing an input iterator interface for your tokeniser. That will enable consumers to gather the tokens into a standard collection, or to use them in streaming fashion without unnecessary overhead.
You should understand the implications of using namespace std;
. I would advise you to steer clear of it at file scope.
(削除) You assign to FindDelim::out_delim
but never read from it - that suggests that it can be removed without changing the meaning. (削除ここまで)
The FindDelim::out_delim
member is a reference that allows modification of non-owned objects. To me, I find that such references weaken the encapsulation of the class, and I prefer to avoid them (as evidenced by the fact I needed to edit my answer here). Perhaps there's a way to limit the action-at-a-distance that's easily overlooked. (I don't have a concrete suggestion here; perhaps some other answerer will propose an improvement).
I suggest using standard predicates instead of writing your own - is_whitespace
is better written as std::isspace
(you'll need to include <cctype>
, of course) unless you really expect backspace in the input.
Although you may return
void-expression;
from a function that returns void, I would reserve this for use only in template functions that may or may not return void depending on their instantiation - not for functions that always return void.
Finally, please ditch the pointless read at the end of main()
- it caused my run to hang waiting for input (which it won't get, in Emacs's compilation buffer...). It's not portable to rely on it without including <cstdio>
, anyway.
Sample code
Here's an approach using std::regex
that addresses the points above. Its performance can obviously be improved, but I'd recommend doing that by swapping out the regex for a custom parser without changing the interface. You can see how the iterators provide flexibility and allow the use of standard algorithms:
#include <algorithm>
#include <iostream>
#include <iterator>
#include <regex>
#include <string>
#include <vector>
int main()
{
const std::string expr{"[X] += 2 +たす 100 +たす 32 +たす 231 -ひく=わ 123 +たす 532"};
const std::regex separator{"\\+=|-="};
std::vector<std::string> tokens;
std::copy(std::sregex_token_iterator(expr.begin(), expr.end(), separator, -1),
std::sregex_token_iterator(),
std::back_inserter(tokens));
// Remove spaces
static int (*const is_space)(int) = std::isspace;
for (auto& s: tokens)
s.erase(std::remove_if(s.begin(), s.end(), is_space), s.end());
// Print result:
std::copy(tokens.begin(), tokens.end(),
std::ostream_iterator<std::string>(std::cout, "\n"));
}
I verified that the output is identical to your original.
-
\$\begingroup\$ "You assign to FindDelim::out_delim but never read from it - that suggests that it can be removed without changing the meaning." I am reading from it. Look at
line 50
. I am creating there astring delin
Next look atline 55
I am passing that string through reference as constructor parameter. After that, inFindDelim
function object I am comparing two strings; first is a fragment of input expression, second is currently examined delimiter. If match is found, delimiter is copied to theout_delim
, and thereforeout_Delim
is a reference, it can be accessed inside higher scope. \$\endgroup\$PatrykB– PatrykB2017年05月01日 15:44:30 +00:00Commented May 1, 2017 at 15:44 -
\$\begingroup\$ Ah, sorry - I missed that it is a reference member! (I tend to avoid modifiable reference members in my own code - they tend to hide unexpected action, as this did for me!). \$\endgroup\$Toby Speight– Toby Speight2017年05月01日 16:23:13 +00:00Commented May 1, 2017 at 16:23