Most likely the regex library should be used for tasks like this, but I am not sure whether this "write-only" language (regex) — for me at least: you write it once, cannot read or maintain it afterwards, and end up rewriting it from scratch every time — will stay extensible when new requirements come. If I am wrong, I would be thankful for a maintainable version that uses regex.
Most likely the regex library should be used for tasks like this, but I am not sure whether this "write-only" language — for me at least: you write it once, cannot read or maintain it afterwards, and end up rewriting it from scratch every time — will stay extensible when new requirements come. If I am wrong, I would be thankful for a maintainable version that uses regex.
Most likely the regex library should be used for tasks like this, but I am not sure whether this "write-only" language (regex) — for me at least: you write it once, cannot read or maintain it afterwards, and end up rewriting it from scratch every time — will stay extensible when new requirements come. If I am wrong, I would be thankful for a maintainable version that uses regex.
The fully functional demo
#include <ctype.h>
#include <string.h>

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
// Returns a pointer to the start of the next lexem in the NUL-terminated
// string at `p`, or nullptr when no lexem is left (null pointer, empty
// input, or only delimiters remain).
// On success `p` is moved to the first character past the lexem, so the
// lexem is the half-open range [return value, p).
//
// Lexems are recognised in this order:
//   1. a "stable" lexem ("i.e.", "etc.", "...") matched as a prefix;
//   2. a run of letters/digits, where '-' or '\'' is absorbed when it
//      directly follows a letter ("semi-simple", "Let's");
//   3. any other single character (e.g. punctuation).
inline const char* get_lexem(const char*& p)
{
    static const std::vector<char> delimiters = { ' ' }; // Could be extended to many different delimiters
    static const std::vector<const char*> stable_lexems = { "i.e.", "etc.", "..." }; // Planned to be externally configurable
    static const std::vector<char> inword_lexems = { '-', '\'' }; // Not sure how to process this better

    const auto contains = [](const std::vector<char>& set, char c) {
        return std::find(set.begin(), set.end(), c) != set.end();
    };
    // The <ctype.h> classifiers are undefined for negative arguments, so
    // every char is converted to unsigned char before classification.
    const auto is_alpha = [](char c) { return isalpha(static_cast<unsigned char>(c)) != 0; };
    const auto is_digit = [](char c) { return isdigit(static_cast<unsigned char>(c)) != 0; };

    if (!p)
        return nullptr;

    // Skip leading delimiters; running into the terminator means nothing is left.
    while (contains(delimiters, *p))
        ++p;
    if (!*p)
        return nullptr;

    const char* const start = p;

    // Stable lexems win over the generic rules; the first matching entry is used.
    for (const char* lexem : stable_lexems) {
        const size_t length = strlen(lexem);
        if (strncmp(p, lexem, length) == 0) {
            p += length;
            return start;
        }
    }

    while (*p && !contains(delimiters, *p)) {
        // An in-word character only continues a word when a letter precedes it.
        const bool joins_word = contains(inword_lexems, *p) && p != start && is_alpha(*(p - 1));
        if (!joins_word && !is_alpha(*p) && !is_digit(*p)) {
            if (p == start)
                ++p; // a non-word character at the start is a one-character lexem
            break;   // otherwise it terminates the word and is left for the next call
        }
        ++p;
    }
    return start;
}
// Demo driver: splits a sample sentence into lexems and prints one per line.
int main()
{
    const char sample[] = "Let's conisder this semi-simple sample, i.e. test data with ints: 100, etc. For ... some testing...";
    const char* lexem = nullptr;
    const char* lexem_end = sample; // scanning cursor; get_lexem() moves it past each lexem
    while (true) {
        lexem = get_lexem(lexem_end);
        // Stop when nothing was found (nullptr) or the returned lexem is empty.
        if (!(lexem && lexem != lexem_end))
            break;
        std::string token(lexem, lexem_end - lexem);
        std::cout << token << "\n";
    }
}
The fully functional demo
#include <algorithm>
#include <iostream>
#include <vector>
#include <string.h>
// Returns a pointer to the start of the next lexem in the NUL-terminated
// string at `p`, or nullptr when no lexem is left (null pointer, empty
// input, or only delimiters remain).
// On success `p` is moved to the first character past the lexem, so the
// lexem is the half-open range [return value, p).
//
// Lexems are recognised in this order:
//   1. a "stable" lexem ("i.e.", "etc.", "...") matched as a prefix;
//   2. a run of letters/digits, where '-' or '\'' is absorbed when it
//      directly follows a letter ("semi-simple", "Let's");
//   3. any other single character (e.g. punctuation).
inline const char* get_lexem(const char*& p)
{
    static const std::vector<char> delimiters = { ' ' }; // Could be extended to many different delimiters
    static const std::vector<const char*> stable_lexems = { "i.e.", "etc.", "..." }; // Planned to be externally configurable
    static const std::vector<char> inword_lexems = { '-', '\'' }; // Not sure how to process this better

    const auto contains = [](const std::vector<char>& set, char c) {
        return std::find(set.begin(), set.end(), c) != set.end();
    };
    // The <ctype.h> classifiers are undefined for negative arguments, so
    // every char is converted to unsigned char before classification.
    const auto is_alpha = [](char c) { return isalpha(static_cast<unsigned char>(c)) != 0; };
    const auto is_digit = [](char c) { return isdigit(static_cast<unsigned char>(c)) != 0; };

    if (!p)
        return nullptr;

    // Skip leading delimiters; running into the terminator means nothing is left.
    while (contains(delimiters, *p))
        ++p;
    if (!*p)
        return nullptr;

    const char* const start = p;

    // Stable lexems win over the generic rules; the first matching entry is used.
    for (const char* lexem : stable_lexems) {
        const size_t length = strlen(lexem);
        if (strncmp(p, lexem, length) == 0) {
            p += length;
            return start;
        }
    }

    while (*p && !contains(delimiters, *p)) {
        // An in-word character only continues a word when a letter precedes it.
        const bool joins_word = contains(inword_lexems, *p) && p != start && is_alpha(*(p - 1));
        if (!joins_word && !is_alpha(*p) && !is_digit(*p)) {
            if (p == start)
                ++p; // a non-word character at the start is a one-character lexem
            break;   // otherwise it terminates the word and is left for the next call
        }
        ++p;
    }
    return start;
}
// Demo driver: tokenizes a sample sentence and writes each lexem on its own line.
int main()
{
    const std::string sample = "Let's conisder this semi-simple sample, i.e. test data with ints: 100, etc. For ... some testing...";
    const char* cursor = sample.c_str();
    // Keep pulling lexems until get_lexem() yields nullptr or an empty range.
    for (const char* begin = get_lexem(cursor);
         begin != nullptr && begin != cursor;
         begin = get_lexem(cursor)) {
        std::cout << std::string(begin, cursor - begin) << "\n";
    }
}
The fully functional demo
#include <algorithm>
#include <iostream>
#include <vector>
#include <string.h>
// Returns a pointer to the start of the next lexem in the NUL-terminated
// string at `p`, or nullptr when no lexem is left (null pointer, empty
// input, or only delimiters remain).
// On success `p` is moved to the first character past the lexem, so the
// lexem is the half-open range [return value, p).
//
// Lexems are recognised in this order:
//   1. a "stable" lexem ("i.e.", "etc.", "...") matched as a prefix;
//   2. a run of letters/digits, where '-' or '\'' is absorbed when it
//      directly follows a letter ("semi-simple", "Let's");
//   3. any other single character (e.g. punctuation).
inline const char* get_lexem(const char*& p)
{
    static const std::vector<char> delimiters = { ' ' }; // Could be extended to many different delimiters
    static const std::vector<const char*> stable_lexems = { "i.e.", "etc.", "..." }; // Planned to be externally configurable
    static const std::vector<char> inword_lexems = { '-', '\'' }; // Not sure how to process this better

    const auto contains = [](const std::vector<char>& set, char c) {
        return std::find(set.begin(), set.end(), c) != set.end();
    };
    // The <ctype.h> classifiers are undefined for negative arguments, so
    // every char is converted to unsigned char before classification.
    const auto is_alpha = [](char c) { return isalpha(static_cast<unsigned char>(c)) != 0; };
    const auto is_digit = [](char c) { return isdigit(static_cast<unsigned char>(c)) != 0; };

    if (!p)
        return nullptr;

    // Skip leading delimiters; running into the terminator means nothing is left.
    while (contains(delimiters, *p))
        ++p;
    if (!*p)
        return nullptr;

    const char* const start = p;

    // Stable lexems win over the generic rules; the first matching entry is used.
    for (const char* lexem : stable_lexems) {
        const size_t length = strlen(lexem);
        if (strncmp(p, lexem, length) == 0) {
            p += length;
            return start;
        }
    }

    while (*p && !contains(delimiters, *p)) {
        // An in-word character only continues a word when a letter precedes it.
        const bool joins_word = contains(inword_lexems, *p) && p != start && is_alpha(*(p - 1));
        if (!joins_word && !is_alpha(*p) && !is_digit(*p)) {
            if (p == start)
                ++p; // a non-word character at the start is a one-character lexem
            break;   // otherwise it terminates the word and is left for the next call
        }
        ++p;
    }
    return start;
}
// Demo driver: walks the sample text lexem by lexem and prints each one.
int main()
{
    const char sample[] = "Let's conisder this semi-simple sample, i.e. test data with ints: 100, etc. For ... some testing...";
    const char* cursor = sample;
    const char* first = nullptr;
    // Loop ends once get_lexem() reports nothing (nullptr) or an empty lexem.
    while ((first = get_lexem(cursor)) != nullptr && first != cursor) {
        const std::string token(first, cursor);
        std::cout << token << "\n";
    }
}
With all these static vectors and the intention to make this configurable and extensible in the future, I am in two minds about whether to turn it into a class or a namespace so that it can be configured with delimiters, stable lexems, in-word lexems, etc. Would that be extensibility or overengineering?
The code
The code
With all these static vectors and the intention to make this configurable and extensible in the future, I am in two minds about whether to turn it into a class or a namespace so that it can be configured with delimiters, stable lexems, in-word lexems, etc. Would that be extensibility or overengineering?
The code