I am rewriting Peter Norvig's Python regex golf solver in C++. I had originally planned to post the whole thing here at once when I was finished, but:
- This program takes double the time of the Python version on the driver input shown in main(). And that's just to complete all the functions I have so far: in half the time, the Python program is able to generate solutions both ways. C++ is supposed to be faster than Python; is there a way to speed this up?
- The code is starting to smell in general and it needs a checkup.
Here is what I have so far:
#include <algorithm>
#include <iostream>
#include <map>
#include <regex>
#include <set>
#include <string>
#include <sstream>
#include <utility>
#include <vector>
using std::endl;
typedef std::pair<std::string, std::set<std::string>> covers_pair;
typedef std::map<std::string, std::set<std::string>> regex_covers_t;
std::string replacements(char c)
{
return c == '^' || c == '$'? std::string{c} : std::string{c} + '.';
}
std::set<std::string> subparts(const std::string& word, size_t subpart_size=5)
{
std::set<std::string> subparts;
for (size_t i = 0; i < word.size(); i++) {
for (size_t s = 0; s < subpart_size; s++) {
subparts.insert(word.substr(i, s + 1));
}
}
return subparts;
}
std::set<std::string> dotify(const std::string& part)
{
if (part.empty()) {
return std::set<std::string>{""};
}
auto result = std::set<std::string>();
for (const auto& rest: dotify(part.substr(1))) {
for (const auto& c: replacements(part.front())) {
result.insert(c + rest);
}
}
return result;
}
const auto rep_chars = {'*', '+', '?'};
std::set<std::string> repetitions(const std::string& part)
{
const auto dots = std::string("..");
auto repetitions = std::set<std::string>();
for (size_t i = 1, limit = part.size(); i <= limit; i++) {
auto A = part.substr(0, i);
auto B = part.substr(i);
bool valid_last_char = A.back() != '^' && A.back() != '$';
bool no_adjoining_dots = !(A.back() == '.' && B.front() == '.');
if (valid_last_char && no_adjoining_dots &&
!std::equal(dots.crbegin(), dots.crend(), A.crbegin()))
{
for (const auto& q: rep_chars) {
repetitions.insert(A + q + B);
}
}
}
return repetitions;
}
std::string join(
const std::set<std::string> container, const std::string& delim)
{
if (container.empty()) {
return "";
}
std::ostringstream joiner;
std::string joined_string;
std::copy(
container.cbegin(), container.cend(),
std::ostream_iterator<std::string>(joiner, delim.c_str()));
joined_string = joiner.str();
// The std::copy call above will append the delimiter to the end, so
// we now remove it.
joined_string.erase(
joined_string.end() - delim.size(), joined_string.end());
return joined_string;
}
std::set<std::string> create_parts_pool(const std::set<std::string>& winners)
{
auto wholes = std::set<std::string>();
for (const auto& winner: winners) {
wholes.insert('^' + winner + '$');
}
auto parts = std::set<std::string>();
for (const auto& whole: wholes) {
for (const auto& subpart: subparts(whole)) {
for (const auto& dotified: dotify(subpart)) {
parts.insert(dotified);
}
}
}
auto winners_str = join(winners, "");
auto charset = std::set<char>(winners_str.begin(), winners_str.end());
auto chars = std::set<std::string>();
for (const auto& c: charset) {
chars.emplace(1, c);
}
auto pairs = std::set<std::string>();
for (const auto& A: chars) {
for (const auto& B: chars) {
for (const auto& q: rep_chars) {
pairs.insert(A + '.' + q + B);
}
}
}
auto reps = std::set<std::string>();
for (const auto& part: parts) {
for (const auto& repetition: repetitions(part)) {
reps.insert(repetition);
}
}
std::set<std::string> pool;
for (auto set: {wholes, parts, chars, pairs, reps}) {
std::set_union(
pool.begin(), pool.end(), set.begin(), set.end(),
std::inserter(pool, pool.begin()));
}
return pool;
}
regex_covers_t regex_covers(
const std::set<std::string>& winners,
const std::set<std::string>& losers)
{
const auto& pool = create_parts_pool(winners);
regex_covers_t covers;
for (const auto& part: pool) {
auto re = std::regex(part);
bool matched_loser = false;
for (const auto& loser: losers) {
if (regex_search(loser, re)) {
matched_loser = true;
break;
}
}
if (matched_loser) {
continue;
}
std::set<std::string> matched_winners;
for (const auto& winner: winners) {
if (std::regex_search(winner, re)) {
matched_winners.insert(winner);
}
}
covers[part] = matched_winners;
}
return covers;
}
std::vector<std::string> create_sorted_parts_list(const regex_covers_t& covers)
{
std::vector<std::string> sorted_parts;
for (const covers_pair& pair: covers) {
sorted_parts.push_back(pair.first);
}
auto sort_lambda = [&covers](const std::string& r1, const std::string& r2)
{
auto r1_pair = std::make_pair(
-static_cast<int>(covers.at(r1).size()),
r1.size());
auto r2_pair = std::make_pair(
-static_cast<int>(covers.at(r2).size()),
r2.size());
return r1_pair < r2_pair;
};
std::sort(sorted_parts.begin(), sorted_parts.end(), sort_lambda);
return sorted_parts;
}
regex_covers_t eliminate_dominated(const regex_covers_t& covers)
{
auto covers_copy = covers;
regex_covers_t new_covers;
const auto& sorted_parts = create_sorted_parts_list(covers);
for (const auto& r: sorted_parts) {
if (covers_copy.at(r).empty()) {
// All remaining r must not cover anything
break;
}
auto is_dominated_lambda = [&covers_copy, &r](const covers_pair& pair)
{
const auto& r2 = pair.first;
bool is_superset = std::includes(
pair.second.cbegin(), pair.second.cend(),
covers_copy.at(r).cbegin(), covers_copy.at(r).cend());
bool is_shorter = r2.size() <= r.size();
return is_superset && is_shorter && r != r2;
};
bool is_dominated = std::any_of(
covers_copy.cbegin(), covers_copy.cend(), is_dominated_lambda);
if (!is_dominated) {
new_covers[r] = covers_copy[r];
} else {
covers_copy.erase(covers_copy.find(r));
}
}
return new_covers;
}
/* Driver code I've used for testing. */
int main()
{
auto twelve_sons_of_jacob = std::set<std::string>{
"reuben", "simeon", "levi", "judah", "dan", "naphthali", "gad",
"asher", "issachar", "zebulun", "joseph", "benjamin"};
auto twelve_disciples = std::set<std::string>{
"peter", "andrew", "james", "john", "philip", "bartholomew",
"judas", "judas iscariot", "simon", "thomas", "matthew"};
for (const auto& part: eliminate_dominated(
regex_covers(twelve_sons_of_jacob, twelve_disciples)))
{
std::cout << "'" << part.first << "': [";
for (const auto& covered: part.second) {
std::cout << "'" << covered << "', ";
}
std::cout << ']' << endl;
}
std::cout << endl;
}
For reference, here is Norvig's original, much more readable Python code:
rep_chars = ('*', '+', '?') cat = ''.join def subparts(word, subpart_size=5): "Return a set of subparts of word, consecutive characters up to length 5." return set(word[i:i+1+s] for i in range(len(word)) for s in range(subpart_size)) def dotify(part): "Return all ways to replace a subset of chars in part with '.'." if part == '': return {''} else: return {c+rest for rest in dotify(part[1:]) for c in replacements(part[0])} def replacements(char): "Return replacement characters for char (char + '.' unless char is special)." if (char == '^' or char == '$'): return char else: return char + '.' def repetitions(part): """Return a set of strings derived by inserting a single repetition character ('+' or '*' or '?') after each non-special character. Avoid redundant repetition of dots.""" splits = [(part[:i], part[i:]) for i in range(1, len(part)+1)] return {A + q + B for (A, B) in splits if not (A[-1] in '^$') and not A.endswith('..') and not (A.endswith('.') and B.startswith('.')) for q in rep_chars} def regex_covers(winners, losers): """ Generate regex components and return a dict of {regex: {winner...}}. Each regex matches at least one winner and no loser. """ losers_str = '\n'.join(losers) wholes = {'^'+winner+'$' for winner in winners} parts = {d for w in wholes for p in subparts(w) for d in dotify(p)} chars = set(cat(winners)) pairs = {A+'.'+q+B for A in chars for B in chars for q in rep_chars} reps = {r for p in parts for r in repetitions(p)} pool = wholes | parts | pairs | reps searchers = [re.compile(c, re.MULTILINE).search for c in pool] return {r: set(filter(searcher, winners)) for (r, searcher) in zip(pool, searchers) if not searcher(losers_str)} def eliminate_dominated(covers): """Given a dict of {regex: {winner...}}, make a new dict with only the regexes that are not dominated by any others. A regex r is dominated by r2 if r2 covers a superset of the matches covered by r, and r2 is shorter.""" newcovers = {} def signature(r): return (-len(covers[r]), len(r)) for r in sorted(covers, key=signature): if not covers[r]: break # All remaining r must not cover anything # r goes in newcache if it is not dominated by any other regex if not any(covers[r2] >= covers[r] and len(r2) <= len(r) for r2 in newcovers): newcovers[r] = covers[r] return newcovers
2 Answers 2
Your eliminate_dominated
function is not equivalent to the original. Try this version:
regex_covers_t eliminate_dominated(const regex_covers_t& covers)
{
regex_covers_t new_covers;
const auto& sorted_parts = create_sorted_parts_list(covers);
for (const auto& r: sorted_parts) {
if (covers.at(r).empty()) {
// All remaining r must not cover anything
break;
}
auto is_dominated_lambda =
[&covers, &r](const covers_pair& pair) {
const auto& r2 = pair.first;
bool is_superset = std::includes(
pair.second.cbegin(), pair.second.cend(),
covers.at(r).cbegin(), covers.at(r).cend());
bool is_shorter = r2.size() <= r.size();
return is_superset && is_shorter;
};
bool is_dominated = std::any_of(
new_covers.cbegin(), new_covers.cend(),
is_dominated_lambda);
if (!is_dominated) {
new_covers[r] = get(covers,r);
}
}
return new_covers;
}
With this fixed, the C++ version runs about 3 times faster than the Python version on my system using g++ -O2
.
-
\$\begingroup\$ Although this cuts down runtime to 0.484 seconds for me, and I thank you very much for this answer, I am confused as to why you say they are not equivalent. Both versions of the function produce the same output for me. \$\endgroup\$EMBLEM– EMBLEM2015年04月26日 22:36:08 +00:00Commented Apr 26, 2015 at 22:36
-
\$\begingroup\$ @EMBLEM: I get a different list of regex results in my test. For example, I get '.p.' for ]'joseph', 'naphthali'] instead of '.ph'. This difference may not be significant for your application. The main thing is that your implementation worked in a different way than the original Python code. I suspect that by looking through covers_copy instead of new_covers, it ended up having to search through more possibilities. \$\endgroup\$Vaughn Cato– Vaughn Cato2015年04月26日 23:08:31 +00:00Commented Apr 26, 2015 at 23:08
You might consider replacing std::set
with a different data structure, such as bool[]
(for the char
s, with appropriate typecasting) or std::unordered_set
. The problem with std::set
is that it guarantees to be able to iterate through the set in sorted order.
Have you profiled it? Where is it spending its time?
-
\$\begingroup\$ I need to iterate through the sets in sorted order in order to use
std::set_difference
andstd::set_union
. And I don't want to allow duplicates. \$\endgroup\$EMBLEM– EMBLEM2015年04月26日 18:36:23 +00:00Commented Apr 26, 2015 at 18:36 -
\$\begingroup\$ What's the point of having both
charset
andchars
? Can't you go directly fromwinners_str
tochars
, maybe using a lambda to stringify? \$\endgroup\$Snowbody– Snowbody2015年04月26日 21:21:02 +00:00Commented Apr 26, 2015 at 21:21 -
\$\begingroup\$ I wanted to, but... stackoverflow.com/questions/29779838/… \$\endgroup\$EMBLEM– EMBLEM2015年04月26日 22:33:58 +00:00Commented Apr 26, 2015 at 22:33
-
\$\begingroup\$ You can still write it more efficiently and then extract it to a method. Even in that stackoverflow question it's clear you don't need the intermediate set. \$\endgroup\$Snowbody– Snowbody2015年04月27日 14:45:22 +00:00Commented Apr 27, 2015 at 14:45
clang++ -Wall -Wextra -Werror -pedantic -std=c++11 -stdlib=libc++ -Os -g
\$\endgroup\$-Os
tries to make code smaller, which might disable interesting vectorization options. I'd suggest that you try-O3
(currently highest supported by Clang). Also, that for loop inmain
doesn't look good to me. I'd hoist the function calls outside the loop to be sure the compiler is not doing something stupid like calling them at every iteration. \$\endgroup\$