I have a string that I want to split into an array on '+', except where the '+' is inside brackets.
E.g. the string
"abc+OR+def+OR+(abc+AND+def)"
becomes
["abc", "OR", "def", "OR", "(abc+AND+def)"]
and the string
"(abc+AND+cde)+OR+(abc+AND+(cde+AND+fgh))"
becomes
["(abc+AND+cde)", "OR", "(abc+AND+(cde+AND+fgh))"]
Is it possible to do this using regular expressions?
-
(4 upvotes) Wouldn't it be easier to use a real parser instead? – Dave Newton, commented Aug 8, 2012 at 18:29
5 Answers 5
You can do this with regex, but only in languages that support recursive regular expressions (for example, Perl or any language with PCRE).
It is not easy with JavaScript regexes, because they do not support recursion. But it is possible with XRegExp and an additional plugin:
http://xregexp.com/plugins/#matchRecursive
Also please check these two links:
Comments
I don't think you could do this with regex. EDIT: per Silver, you could use regex.
One way would be to just parse the string character by character. I'll edit my answer with code in a minute.
EDIT: Here's a sample implementation (note: untested, may have a bug or two):
/**
 * Splits a string on every '+' that is not enclosed in parentheses.
 *
 * Parenthesized groups (including nested ones) are kept intact, brackets
 * included, e.g. "abc+OR+(abc+AND+def)" -> ["abc", "OR", "(abc+AND+def)"].
 * Like String.prototype.split, consecutive or leading/trailing '+' produce
 * empty-string segments, and the empty string yields [""].
 *
 * @param {string} str - The string to split.
 * @returns {string[]} The top-level segments, in order.
 */
function parseString (str) {
  var splitStr = [];
  var parentheses = 0; // current nesting depth
  var current = '';    // segment being accumulated

  for (var j = 0; j < str.length; j++) {
    var ch = str[j];

    // A '+' at depth 0 is a separator: close the current segment.
    if (ch === '+' && parentheses === 0) {
      splitStr.push(current);
      current = '';
      continue;
    }

    if (ch === '(') {
      parentheses++;
    } else if (ch === ')' && parentheses > 0) {
      // Ignore unmatched ')' so the depth never goes negative, which would
      // otherwise stop every later top-level '+' from splitting.
      parentheses--;
    }

    // Every non-separator character, brackets included, belongs to the segment.
    current += ch;
  }

  splitStr.push(current);
  return splitStr;
}
Comments
You can use the match method of String object to do this and use the following regex:
// Collects every match of the pattern (the `g` flag makes match() return all
// of them). The pattern accepts either a run of ASCII letters, or one or more
// adjacent parenthesized groups whose content is letter runs joined by '+'.
// Top-level '+' characters are matched by neither alternative, so they are
// skipped. Inside the brackets only letters and '+' are allowed, so a group
// that itself contains brackets is not matched as a single unit.
// NOTE(review): `stringObj` is not defined in this snippet; presumably it is
// the input string -- confirm. The `i` flag looks redundant given [a-zA-Z].
stringObj.match(/([a-zA-Z]+)|([(]([a-zA-Z]+[+])+[a-zA-Z]+[)])+/gi);
Comments
This regular expression would suit your needs.
(?!=\([\w\+]+)\+(?![\w+\+]+\))
See it in action here.
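There is one small problem: negative lookbehind is not implemented in the JavaScript regular expression parser (it was only added to the language in ES2018). Note also that the standard syntax for negative lookbehind is (?<!...); the (?!=...) form written above is actually parsed as a negative lookahead for a literal '=' followed by the rest of the group.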
For anyone who is learning regular expressions, here is a walkthrough:
(?!=\([\w\+]+) is intended as a negative lookbehind. It means "not preceded by ..." In this case, we're looking for something not preceded by (lettersOr+.
\+ is what we are looking for. A plus sign (escaped)
(?![\w+\+]+\)) is a negative lookahead. It means "not followed by ..." In this case, we're looking for something not followed by lettersOr+)
Comments
This function should work for you:
// Sentinel formerly substituted for parenthesized groups while splitting.
// splitPlusNoParenthesis no longer needs it (see below); the declaration is
// kept so any other code that references the name keeps working.
var PARENTH_STRING_PLACE_HOLDER = '__PARSTRINGHOLDER__';

/**
 * Splits a string on every '+' that lies outside a balanced parenthesized
 * group, as reported by getParenthesizedStrings.
 *
 * Groups are located by position instead of being swapped for a sentinel
 * string. The sentinel approach leaked the placeholder into the result when a
 * group shared a token with other text (e.g. "f(a+b)+c") and mis-assigned
 * groups when the input already contained the sentinel text.
 *
 * Behaves like str.split('+') for text outside groups: empty segments are
 * preserved and the empty string yields [""]. An unclosed '(' is not a group,
 * so '+' characters after it still split.
 *
 * @param {string} str - The string to split.
 * @returns {string[]} The top-level segments, in order.
 */
var splitPlusNoParenthesis = function(str){
    var parenthStrings = getParenthesizedStrings(str);
    var splitString = [];
    var current = '';   // segment being accumulated
    var cursor = 0;     // index in str of the first character not yet consumed

    // One pass per group, plus a final pass for the text after the last group.
    for(var g = 0; g <= parenthStrings.length; g++){
        var hasGroup = g < parenthStrings.length;

        // The text between the cursor and the next group never contains '('
        // (any '(' there would have started a group of its own), so the first
        // match from the cursor is the group's real position.
        var groupStart = hasGroup ? str.indexOf(parenthStrings[g], cursor) : str.length;

        // Split only the plain text that precedes the group.
        var plainParts = str.substring(cursor, groupStart).split('+');
        current += plainParts[0];
        for(var p = 1; p < plainParts.length; p++){
            splitString.push(current);
            current = plainParts[p];
        }

        if(hasGroup){
            // The group is appended verbatim; its inner '+' are not separators.
            current += parenthStrings[g];
            cursor = groupStart + parenthStrings[g].length;
        }
    }

    splitString.push(current);
    return splitString;
};
/**
 * Collects every top-level, balanced parenthesized substring of `str`, in
 * order of appearance. Nested brackets stay inside their enclosing group.
 * If a '(' is never closed, scanning stops there and nothing more is reported.
 *
 * @param {string} str - The string to scan.
 * @returns {string[]} The balanced groups, each including its outer brackets.
 */
var getParenthesizedStrings = function(str){
    var groups = [];
    var pos = 0;

    while(pos < str.length){
        // Skip everything that does not open a group.
        if(str[pos] !== '('){
            pos += 1;
            continue;
        }

        // Walk forward until the bracket opened at `pos` is closed again.
        var depth = 1;
        var close = pos + 1;
        while(close < str.length){
            var ch = str[close];
            if(ch === '('){
                depth += 1;
            } else if(ch === ')'){
                depth -= 1;
            }
            if(depth === 0){
                groups.push(str.substring(pos, close + 1));
                break;
            }
            close += 1;
        }

        // Resume right after the closing bracket. When no closing bracket was
        // found, `close` equals str.length and the outer loop ends.
        pos = close + 1;
    }

    return groups;
};