Task:
This function searches given null terminated string pStr
by given subset of
regular expression pMatch
.
Return value is matched string appeared in pStr
.
If nothing has matched, return value is empty string.
The subset of regular expression grammar is defined as follows:
- A string is set of 8-bit characters. Beware that it is not 7-bit.
- The metacharacter
\
is an escape character, that means any single letter appeared next used as is. - The characters in squared brackets metacharacters
[
and]
is a sequence of literal characters or metacharacters. Immediately after]
, another metacharacters in curly braces{
num}
must be followed. The num is a sequence of digits which specifies the number of times the previous literal character or metacharacter sequence in squared brackets must be repeated. e.g. "abc[def]{2}ghi
" matches "abcdefdefghi
". - The squared brackets metacharacters can be nested.
e.g. "
abc[def[ghi]{3}jkl]{2}mno
" matches "abcdefghighighijkldefghighighijklmno
". - In case of matching pattern given by
pMatch
is a grammatical error, this function returns a string "ERROR
".
My Solution:
Define a state machine and put regular expression strings into stacks. When used then pop up, multiple by N
times and push to stack again.
escape <-- escape <---- escape <-----
| | | | | |
| | | | | |
normal --> repeatStrStart --> repeatStrEnd --> repeatNumStart --
^ ^ | |
| | | |
| ------------ |
----------------------------------------------------
Code:
string PatternSearch(const unsigned char* pStr, const unsigned char* pMatch)
{
enum status {
normal,
escape,
repeatStrStart,
repeatStrEnd,
repeatNumStart
};
status current = normal;
status restore = normal;
const static string strErr = "ERROR";
stringstream bufNormal;
stringstream bufRepeat;
stringstream bufTmp;
stack<unsigned char> stackBracket;
stack<string> stackRepeat;
unsigned long repeat = 0;
size_t check = 0;
int i = 0;
for(i=0; pMatch[i] != '0円'; i++) {
if(current == normal) {
if(pMatch[i] == '\\') {
restore = current;
current = escape;
}
else if(pMatch[i] == '[') {
current = repeatStrStart;
stackBracket.push(pMatch[i]);
}
else {
bufNormal << pMatch[i];
}
}
else if(current == escape) {
if(restore == normal) {
bufNormal << pMatch[i];
current = restore;
}
else if(restore == repeatStrStart) {
bufRepeat << pMatch[i];
current = restore;
}
else if(restore == repeatNumStart) {
bufRepeat << pMatch[i];
current = restore;
}
else {
bufNormal.str(strErr);
bufNormal.clear();
break;
}
}
else if(current == repeatStrStart) {
if(pMatch[i] == '\\') {
restore = current;
current = escape;
}
else if(pMatch[i] == '[') {
stackBracket.push(pMatch[i]);
stackRepeat.push(bufRepeat.str());
bufRepeat.str("");
bufRepeat.clear();
}
else if(pMatch[i] == ']') {
if(stackBracket.empty() == false && stackBracket.top() == '[') {
current = repeatStrEnd;
if(stackBracket.size() > stackRepeat.size()) {
stackRepeat.push(bufRepeat.str());
}
else {
bufTmp.str("");
bufTmp.clear();
bufTmp << stackRepeat.top();
bufTmp << bufRepeat.str();
stackRepeat.pop();
stackRepeat.push(bufTmp.str());
bufTmp.str("");
bufTmp.clear();
}
stackBracket.pop();
bufRepeat.str("");
bufRepeat.clear();
}
else {
bufNormal.str(strErr);
bufNormal.clear();
break;
}
}
else {
bufRepeat << pMatch[i];
}
}
else if(current == repeatStrEnd) {
if(pMatch[i] == '{') {
stackBracket.push(pMatch[i]);
current = repeatNumStart;
}
else {
bufNormal.str(strErr);
bufNormal.clear();
break;
}
}
else {
if(pMatch[i] == '\\') {
restore = current;
current = escape;
}
else if(pMatch[i] == '}') {
if(stackBracket.empty() == false && stackBracket.top() == '{') {
stackBracket.pop();
try {
repeat = stoul(/*str=*/bufRepeat.str(), /*idx=*/&check);
if(check == bufRepeat.str().length()) {
bufRepeat.str("");
bufRepeat.clear();
while(repeat--) {
bufRepeat << stackRepeat.top();
}
stackRepeat.pop();
if(stackRepeat.empty()) {
bufNormal << bufRepeat.str();
}
else {
bufTmp.str("");
bufTmp.clear();
bufTmp << stackRepeat.top();
bufTmp << bufRepeat.str();
stackRepeat.pop();
stackRepeat.push(bufTmp.str());
}
bufTmp.str("");
bufTmp.clear();
bufRepeat.str("");
bufRepeat.clear();
}
else {
bufNormal.str(strErr);
bufNormal.clear();
break;
}
}
catch(invalid_argument e) {
bufNormal.str(strErr);
bufNormal.clear();
break;
}
catch(out_of_range e) {
bufNormal.str(strErr);
bufNormal.clear();
break;
}
if(stackBracket.empty()) {
current = normal;
}
else {
current = repeatStrStart;
}
}
else {
bufRepeat.str(strErr);
bufRepeat.clear();
break;
}
}
else {
bufRepeat << pMatch[i];
}
}
}
if(stackBracket.empty() == false || stackRepeat.empty() == false) {
bufNormal.str(strErr);
bufNormal.clear();
}
else if(current != normal) {
bufNormal.str(strErr);
bufNormal.clear();
}
else {
bufTmp.str("");
bufTmp.clear();
bufTmp << pStr;
if(bufTmp.str().find(bufNormal.str()) == string::npos) {
bufNormal.str("");
bufNormal.clear();
}
}
return bufNormal.str();
}
1 Answer 1
Speaking from a purely style point of view, nine levels of indentation is a substantial red flag, when you should be able to keep it at three.
So much indentation makes this code hard to read in the first place, which is not the sort of practice you want to get into, especially if your code will be read and evaluated by others.
std::regex
? \$\endgroup\$