PEG.js grammar for parsing CSS selectors

Question 1

I have a library that's parsing some expressions, part of which is a CSS selector. More accurately, it's a jQuery-compatible selector. The selector itself is opaque to my library - I don't need to pick it apart, and I don't need to verify that tag and attribute names are valid HTML, or that its pseudo-selectors exist in the current spec. For now, I just need it to match all syntactically valid CSS selectors and not match the rest of the expression, which is delimited by characters that would not be valid in a selector unless escaped or put in a string in an attribute selector. I would use a RegExp and split the string if CSS had no escapes or quoted values, but it started looking write-only really fast.

Does this PEG.js grammar cover all the other edge cases as defined by the CSS standard and jQuery documentation? Could it be more readable? Better in some other way?


{
/*
 * Flatten a parser result into a single string.
 * Strings are returned unchanged, arrays are flattened recursively with
 * their pieces concatenated in order, and any other value contributes "".
 */
function collapse(stuff) {
  if (typeof stuff === "string") {
    return stuff;
  }
  if (!(stuff instanceof Array)) {
    return "";
  }
  return stuff.map(function (item) {
    return collapse(item);
  }).join("");
}
/*
 * Reduce a white-space match to at most one space: returns " " when
 * something was matched, and "" for a falsy value or an empty array.
 */
function trimWs(present) {
  var isEmptyArray = present instanceof Array && !present.length;
  return present && !isEmptyArray ? " " : "";
}
}
/* First rule of the grammar; it simply delegates to jqSelector. */
start = jqSelector
/*
 * A run of one or more white-space characters (space, CR, LF, tab or
 * form feed), returned as the matched text.
 */
ws "white space" = 
 whites:(" " / "\r" / "\n" / "\t" / "\f")+
{ return collapse(whites); }
/* Optional white space that is discarded: the rule always yields "". */
iws "ignored white space" = ws? { return ""; }
/* Mandatory white space; trimWs() turns the matched run into a single space. */
cws "collapsed white space" = ws:ws { return trimWs(ws); }
/*
 * One or more simple selectors (tag/id/class, pseudo, attribute) with no
 * white space between them, optionally followed by another jqSelector.
 * The two are joined either by an explicit combinator or comma (surrounding
 * white space is dropped) or by white space alone (collapsed to one space).
 * The combinator alternative is tried first, so "a > b" is not read as a
 * descendant selector. The action flattens everything matched into a string.
 */
jqSelector "jquery-compatible selector" =
 element:(tagIdClassSelector / cssFunctional / cssAttrExpr)+
 more:(iws [\+>~,] iws jqSelector / cws jqSelector)?
 { return collapse([element, more]); }
/*
 * An identifier optionally prefixed with "#" or ".", or the universal
 * selector "*". Sequence binds tighter than choice, so this reads as
 * ([#.]? cssIdentifier) / "*".
 */
tagIdClassSelector "tag, id, or class selector" =
 [#\.]? cssIdentifier / "*"
/*
 * An identifier: a valid start followed by any number of identifier
 * characters, returned as a string.
 */
cssIdentifier "CSS identifier" = 
 start:cssIdStart rest:cssIdChar* 
{
 return collapse([start, rest]);
}
/*
 * Start of an identifier: an ASCII letter or underscore, an escape sequence,
 * or a hyphen followed by one of those.
 * NOTE(review): CSS identifiers may also contain non-ASCII characters, which
 * are not matched here - confirm whether that is needed.
 */
cssIdStart "start of CSS identifier" =
 [a-zA-Z_]
 / escapedChar
 / "-" ([a-zA-Z_] / escapedChar)
/* Any later identifier character: hyphen, letter, digit, underscore or escape. */
cssIdChar "rest of CSS identifier" =
 [-a-zA-Z0-9_] / escapedChar
/* A backslash followed by a unicode escape or, failing that, any one character. */
escapedChar "escape sequence" =
 "\\" (escapedUnicode / .)
/*
 * One to six hexadecimal digits of a unicode escape, returned as a string.
 * The whole nested group carries a single label because labels placed inside
 * a parenthesized or optional sub-expression are not in scope for the action
 * of the enclosing rule (referencing them throws a ReferenceError).
 * collapse() flattens the nested match and turns unmatched optional parts
 * into "".
 */
escapedUnicode "1-6 hexadecimal digits (unicode escape)" =
 digits:(hexd (hexd (hexd (hexd (hexd hexd?)?)?)?)?)
{
 return collapse(digits);
}
/* A single hexadecimal digit, upper or lower case. */
hexd "hexadecimal digit" =
 [0-9a-fA-F]
/*
 * ":" plus an identifier, optionally followed by a parenthesized argument.
 * No white space is accepted between the parentheses and the argument.
 */
cssFunctional "pseudo-selector or functional selector" =
 ":" cssIdentifier ( "(" cssArg ")" )?
/*
 * Attribute selector: "[" name "]" or "[" name op value "]", where op is "="
 * optionally preceded by one of | ~ * ^ $ ! and value is an identifier or a
 * quoted string. "!=" is jQuery's attribute-not-equal extension.
 * White space is allowed (and dropped from the result) around the name, the
 * operator and the value, as the CSS selector grammar permits.
 */
cssAttrExpr "attribute selector" =
 "[" iws cssIdentifier iws (
 [|~\*^$!]? "=" iws (cssIdentifier / quotedString) iws
 )? "]"
/*
 * A single- or double-quoted string. Inside the quotes an escape sequence is
 * tried first, so an escaped quote character does not end the string; any
 * other character except the closing quote is accepted as-is.
 */
quotedString "quoted string" = 
 "'" (escapedChar / [^\'])* "'"
 / '"' (escapedChar / [^\"])* '"'
/*
 * Argument of a functional pseudo-selector. Alternatives are tried in order:
 * an nth-expression (only when it is immediately followed by ")"), an
 * unsigned integer, a quoted string (e.g. :contains("text")), or a nested
 * selector. The ")" look-ahead stops anpbOddEven from claiming just the
 * leading "n", "even" or "odd" of a selector such as :not(nav), which would
 * otherwise leave the rest of the argument unparsed.
 */
cssArg "functional selector argument" = 
 anpbOddEven &")" / uint / quotedString / jqSelector
/*
 * Argument forms for nth-style pseudo-classes, tried in order:
 *   1. an "an+b" expression: optional sign, optional coefficient, "n" or
 *      "N", then optionally a signed offset with white space allowed around
 *      the sign;
 *   2. an explicitly signed integer;
 *   3. the keyword "even";
 *   4. the keyword "odd".
 * Unsigned integers are not matched here; cssArg falls back to uint for them.
 */
anpbOddEven "'an+b' expression, 'odd', 'even', or integer" = 
 [-\+]? uint? [nN] (ws* [-\+] ws* uint)? 
 / [-\+] uint 
 / "even"
 / "odd"
/* A positive integer without leading zeros, or a lone "0". */
uint "unsigned integer" = [1-9][0-9]* / "0"

Compiled JS parser & minimal interactive tester is here: http://jsfiddle.net/np6BD/

UPDATE

Incorporating suggestions regarding the JS:


{
/*
 * Flatten the (possibly nested) array returned by the parser into one
 * string. Arrays are serialized element by element, strings pass through
 * untouched, and every other value is treated as the empty string.
 * Named "serialize" rather than "collapse" so that it is not confused with
 * the white-space collapsing done by the cws token.
 */
function serialize(stuff) {
  var text = "";
  var i;
  if (stuff instanceof Array) {
    for (i = 0; i < stuff.length; i += 1) {
      text += serialize(stuff[i]);
    }
  } else if (typeof stuff === "string") {
    text = stuff;
  }
  return text;
}
/*
 * Replace superfluous white space with a single space.
 * Function moved to the cws token
 */
}
/* First rule of the grammar; it simply delegates to jqSelector. */
start = jqSelector
/*
 * A run of one or more white-space characters (space, CR, LF, tab or
 * form feed), returned as the matched text.
 */
ws "white space" = 
 whites:(" " / "\r" / "\n" / "\t" / "\f")+
{ return serialize(whites); }
/* Optional white space that is discarded: the rule always yields "". */
iws "ignored white space" = ws? { return ""; }
/*
 * Mandatory white space collapsed to a single space. The ws rule only
 * succeeds on a non-empty run and returns it as a string, so the match needs
 * no further inspection: the result is always " ".
 */
cws "collapsed white space" = ws { return " "; }
/*
 * One or more simple selectors (tag/id/class, pseudo, attribute) with no
 * white space between them, optionally followed by another jqSelector.
 * The two are joined either by an explicit combinator or comma (surrounding
 * white space is dropped) or by white space alone (collapsed to one space).
 * The combinator alternative is tried first, so "a > b" is not read as a
 * descendant selector. The action flattens everything matched into a string.
 */
jqSelector "jquery-compatible selector" =
 element:(tagIdClassSelector / cssFunctional / cssAttrExpr)+
 more:(iws [\+>~,] iws jqSelector / cws jqSelector)?
{
 return serialize([element, more]);
}
/*
 * An identifier optionally prefixed with "#" or ".", or the universal
 * selector "*". Sequence binds tighter than choice, so this reads as
 * ([#.]? cssIdentifier) / "*".
 */
tagIdClassSelector "tag, id, or class selector" =
 [#\.]? cssIdentifier / "*"
/*
 * An identifier: a valid start followed by any number of identifier
 * characters, returned as a string.
 */
cssIdentifier "CSS identifier" = 
 start:cssIdStart rest:cssIdChar* 
{
 return serialize([start, rest]);
}
/*
 * Start of an identifier: an ASCII letter or underscore, an escape sequence,
 * or a hyphen followed by one of those.
 * NOTE(review): CSS identifiers may also contain non-ASCII characters, which
 * are not matched here - confirm whether that is needed.
 */
cssIdStart "start of CSS identifier" =
 [a-zA-Z_]
 / escapedChar
 / "-" ([a-zA-Z_] / escapedChar)
/* Any later identifier character: hyphen, letter, digit, underscore or escape. */
cssIdChar "rest of CSS identifier" =
 [-a-zA-Z0-9_] / escapedChar
/* A backslash followed by a unicode escape or, failing that, any one character. */
escapedChar "escape sequence" =
 "\\" (escapedUnicode / .)
/*
 * One to six hexadecimal digits of a unicode escape, returned as a string.
 * The whole nested group carries a single label because labels placed inside
 * a parenthesized or optional sub-expression are not in scope for the action
 * of the enclosing rule (referencing them throws a ReferenceError).
 * serialize() flattens the nested match and turns unmatched optional parts
 * into "".
 */
escapedUnicode "1-6 hexadecimal digits (unicode escape)" =
 digits:(hexd (hexd (hexd (hexd (hexd hexd?)?)?)?)?)
{
 return serialize(digits);
}
/* A single hexadecimal digit, upper or lower case. */
hexd "hexadecimal digit" =
 [0-9a-fA-F]
/*
 * ":" plus an identifier, optionally followed by a parenthesized argument.
 * No white space is accepted between the parentheses and the argument.
 */
cssFunctional "pseudo-selector or functional selector" =
 ":" cssIdentifier ( "(" cssArg ")" )?
/*
 * Attribute selector: "[" name "]" or "[" name op value "]", where op is "="
 * optionally preceded by one of | ~ * ^ $ ! and value is an identifier or a
 * quoted string. "!=" is jQuery's attribute-not-equal extension.
 * White space is allowed (and dropped from the result) around the name, the
 * operator and the value, as the CSS selector grammar permits.
 */
cssAttrExpr "attribute selector" =
 "[" iws cssIdentifier iws (
 [|~\*^$!]? "=" iws (cssIdentifier / quotedString) iws
 )? "]"
/*
 * A single- or double-quoted string. Inside the quotes an escape sequence is
 * tried first, so an escaped quote character does not end the string; any
 * other character except the closing quote is accepted as-is.
 */
quotedString "quoted string" = 
 "'" (escapedChar / [^\'])* "'"
 / '"' (escapedChar / [^\"])* '"'
/*
 * Argument of a functional pseudo-selector. Alternatives are tried in order:
 * an nth-expression (only when it is immediately followed by ")"), an
 * unsigned integer, a quoted string (e.g. :contains("text")), or a nested
 * selector. The ")" look-ahead stops anpbOddEven from claiming just the
 * leading "n", "even" or "odd" of a selector such as :not(nav), which would
 * otherwise leave the rest of the argument unparsed.
 */
cssArg "functional selector argument" = 
 anpbOddEven &")" / uint / quotedString / jqSelector
/*
 * Argument forms for nth-style pseudo-classes, tried in order:
 *   1. an "an+b" expression: optional sign, optional coefficient, "n" or
 *      "N", then optionally a signed offset with white space allowed around
 *      the sign;
 *   2. an explicitly signed integer;
 *   3. the keyword "even";
 *   4. the keyword "odd".
 * Unsigned integers are not matched here; cssArg falls back to uint for them.
 */
anpbOddEven "'an+b' expression, 'odd', 'even', or integer" = 
 [-\+]? uint? [nN] (ws* [-\+] ws* uint)? 
 / [-\+] uint 
 / "even"
 / "odd"
/* A positive integer without leading zeros, or a lone "0". */
uint "unsigned integer" = [1-9][0-9]* / "0"

Question 2

I can only review the JS part, I am not sure that grammar reviews are part of CR.

collapse

else branches after a return do not make sense.
I would propose map/join instead of reduce/concat

Something like this:

// Flatten nested parser output into one string: strings are returned as-is,
// arrays are flattened recursively and joined, anything else yields "".
function collapse(stuff) {
  if (typeof stuff === "string") {
    return stuff;
  }
  return stuff instanceof Array ? stuff.map(collapse).join("") : "";
}

trimWs

The function does not match the comment, and I cannot see what it is supposed to do. You need a better function name, a better parameter name and a proper comment explaining what it does and how it is used.

Update

trimWs( "abc" ) -> Returns " "
trimWs( " " ) -> Returns " "
trimWs( " " ) -> Returns " "
trimWs( "\t" ) -> Returns " "
trimWs( "" ) -> Returns ""
trimWs( " abc " ) -> Returns " "
If this function were trimming, it would return "abc" for the last call

Basically it returns a space except for provided empty strings and empty arrays. The code could very well be:

// Returns "" for a falsy value or an empty array, and a single space for
// everything else.
function reduceToSingleSpace( x ) {
  if ( !x ) {
    return "";
  }
  if ( x instanceof Array && !x.length ) {
    return "";
  }
  return " ";
}

Question 3

trimWs = trim White space. It determines if any white space is present by examining its parameter and returns a single space or nothing. In CSS, a#foo p.bar is the same as a#foo p.bar but not the same as a#foop.bar. Can't get rid of the spaces altogether, but there's no point in holding onto more than one. Can you give me an example of a more appropriate name? Good call on map/join, will do, thanks.

Question 4

In the above comment, the second a#foo p.bar was supposed to have many spaces. Not sure where they went. Ironically, that's exactly what the function does - get rid of extraneous white space.

Question 5

Updated the answer, not too excited by reduceToSingleSpace, but that's the best I could come up with so far.

Question 6

trimWs(" abc ") is where the PEG is important; trimWs is only called to process the cws (for collapsed white space) token. I suppose I should probably both rename collapse to serialize (to avoid confusion about the two senses of the word collapse) and move the body of trimWs to be an anonymous function in the cws token? What do you think?

Question 7

It sounds better than what you have now or what I proposed.

konijn konijn 34.3k5 gold badges71 silver badges267 bronze badges · Answer 1 · 2013-12-27 02:16:56Z

I can only review the JS part, I am not sure that grammar reviews are part of CR.

collapse

else branches after a return do not make sense.
I would propose map/join instead of reduce/concat

Something like this:

// Flatten nested parser output into one string: strings are returned as-is,
// arrays are flattened recursively and joined, anything else yields "".
function collapse(stuff) {
  if (typeof stuff === "string") {
    return stuff;
  }
  return stuff instanceof Array ? stuff.map(collapse).join("") : "";
}

trimWs

The function does not match the comment, and I cannot see what it is supposed to do. You need a better function name, a better parameter name and a proper comment explaining what it does and how it is used.

Update

trimWs( "abc" ) -> Returns " "
trimWs( " " ) -> Returns " "
trimWs( " " ) -> Returns " "
trimWs( "\t" ) -> Returns " "
trimWs( "" ) -> Returns ""
trimWs( " abc " ) -> Returns " "
If this function were trimming, it would return "abc" for the last call

Basically it returns a space except for provided empty strings and empty arrays. The code could very well be:

// Returns "" for a falsy value or an empty array, and a single space for
// everything else.
function reduceToSingleSpace( x ) {
  if ( !x ) {
    return "";
  }
  if ( x instanceof Array && !x.length ) {
    return "";
  }
  return " ";
}

trimWs = trim White space. It determines if any white space is present by examining its parameter and returns a single space or nothing. In CSS, a#foo p.bar is the same as a#foo p.bar but not the same as a#foop.bar. Can't get rid of the spaces altogether, but there's no point in holding onto more than one. Can you give me an example of a more appropriate name? Good call on map/join, will do, thanks.
In the above comment, the second a#foo p.bar was supposed to have many spaces. Not sure where they went. Ironically, that's exactly what the function does - get rid of extraneous white space.
Updated the answer, not too excited by reduceToSingleSpace, but that's the best I could come up with so far.
trimWs(" abc ") is where the PEG is important; trimWs is only called to process the cws (for collapsed white space) token. I suppose I should probably both rename collapse to serialize (to avoid confusion about the two senses of the word collapse) and move the body of trimWs to be an anonymous function in the cws token? What do you think?

Stack Exchange Network

PEG.js grammar for parsing CSS selectors

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

PEG.js grammar for parsing CSS selectors

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions