I made another markdown parser. (original, this one on github). Look, I'm not crazy (sorta), I just like making markdown parsers...
String.prototype.replaceAll = function(find, replace) {
if (typeof find == 'string') return this.split(find).join(replace);
var t = this, i, j;
while (typeof(i = find.shift()) == 'string' && typeof(j = replace.shift()) == 'string') t = t.replaceAll(i || '', j || '');
return t;
};
function html(input, replaceQuoteOff) {
if (replaceQuoteOff) return input.toString().replaceAll(['&', '<'], ['&', '<']);
return input.toString().replaceAll(['&', '<', '"'], ['&', '<', '"']);
};
function inlineMarkdown(input) {
var backslash = Math.random().toString();
while (input.indexOf(backslash) != -1) backslash = Math.random().toString();
input = input.replaceAll('\\\\', backslash);
var graveaccent = Math.random().toString();
while (input.indexOf(graveaccent) != -1 || [backslash].indexOf(graveaccent) != -1) graveaccent = Math.random().toString();
input = input.replaceAll('\\`', graveaccent);
var asterisk = Math.random().toString();
while (input.indexOf(asterisk) != -1 || [backslash, graveaccent].indexOf(asterisk) != -1) asterisk = Math.random().toString();
input = input.replaceAll('\\*', asterisk);
var underscore = Math.random().toString();
while (input.indexOf(underscore) != -1 || [backslash, graveaccent, asterisk].indexOf(underscore) != -1) underscore = Math.random().toString();
input = input.replaceAll('\\_', underscore);
var dash = Math.random().toString();
while (input.indexOf(dash) != -1 || [backslash, graveaccent, asterisk, underscore].indexOf(dash) != -1) dash = Math.random().toString();
input = input.replaceAll('\\-', dash);
var plus = Math.random().toString();
while (input.indexOf(plus) != -1 || [backslash, graveaccent, asterisk, underscore, dash].indexOf(plus) != -1) plus = Math.random().toString();
input = input.replaceAll('\\+', plus);
var dot = Math.random().toString();
while (input.indexOf(dot) != -1 || [backslash, graveaccent, asterisk, underscore, dash, plus].indexOf(dot) != -1) dot = Math.random().toString();
input = input.replaceAll('\\.', dot);
var hash = Math.random().toString();
while (input.indexOf(hash) != -1 || [backslash, graveaccent, asterisk, underscore, dash, plus, dot].indexOf(hash) != -1) hash = Math.random().toString();
input = input.replaceAll('\\#', hash);
var gt = Math.random().toString();
while (input.indexOf(gt) != -1 || [backslash, graveaccent, asterisk, underscore, dash, plus, dot, hash].indexOf(gt) != -1) gt = Math.random().toString();
input = input.replaceAll('\\>', gt);
var paren = '#' + Math.random().toString();
while (input.indexOf(paren) != -1) paren = '#' + Math.random().toString();
input = input.replaceAll('\\(', paren);
var cparen = '#' + Math.random().toString();
while (input.indexOf(cparen) != -1 || [paren].indexOf(cparen) != -1) cparen = '#' + Math.random().toString();
input = input.replaceAll('\\)', cparen);
var carrot = '#' + Math.random().toString();
while (input.indexOf(carrot) != -1 || [paren, cparen].indexOf(carrot) != -1) carrot = '#' + Math.random().toString();
input = input.replaceAll('\\^', carrot);
var dollar = '#' + Math.random().toString();
while (input.indexOf(dollar) != -1 || [paren, cparen, carrot].indexOf(dollar) != -1) dollar = '#' + Math.random().toString();
input = input.replaceAll('\\$', dollar);
var open = [];
return input.split('`').map(function(val, i, arr) {
if (i % 2) return '<code>' + html(val.replaceAll([backslash, graveaccent, asterisk, underscore, dash, plus, dot, hash, gt, paren, cparen, carrot, dollar], ['\\\\', '\\`', '\\*', '\\_', '\\-', '\\+', '\\.', '\\#', '\\>', '\\(', '\\)', '\\^'])) + '</code>';
var parsed = val.split('*').map(function(val, i, arr) {
var parsed = val.split('_').map(function(val, i, arr) {
var parsed = val.split('---').map(function(val, i, arr) {
var parsed = val.split('+++').map(function(val, i, arr) {
var parsed = html(val.replaceAll([backslash, graveaccent, asterisk, underscore, dash, plus, dot, hash, gt], ['\\', '`', '*', '_', '-', '+', '.', '#', '>']), true)
.replace(/!\[([^\]]+)]\(([^\s("\\]+\.[^\s()"\\]+)\)/g, '<img alt="1ドル" src="2ドル" />')
.replace(/\[([^\]]+)]\((https?:\/\/[^\s("\\]+\.[^\s()"\\]+)\)/g, '1ドル'.link('2ドル'))
.replace(/([^;["\\])(https?:\/\/([^\s("\\]+\.[^\s()"\\]+))/g, '1ドル' + '3ドル'.link('2ドル'))
.replace(/^(https?:\/\/([^\s("\\]+\.[^\s()"\\]+))/g, '2ドル'.link('1ドル'))
.replace(/\^(\w+)/g, '<sup>1ドル</sup>');
if (i % 2) {
var p = open.indexOf('</ins>')
if (p != -1) {
open.splice(p, 1);
return '</ins>' + parsed;
} else if (arr[i + 1] === undefined) {
open.push('</ins>');
return '<ins>' + parsed;
}
}
return i % 2 ? '<ins>' + parsed + '</ins>' : parsed;
}).join('');
if (i % 2) {
var p = open.indexOf('</del>');
if (p != -1) {
open.splice(p, 1);
return '</del>' + parsed;
} else if (arr[i + 1] === undefined) {
open.push('</del>');
return '<del>' + parsed;
}
}
return i % 2 ? '<del>' + parsed + '</del>' : parsed;
}).join('');
if (i % 2) {
var p = open.indexOf('</strong>');
if (p != -1) {
open.splice(p, 1);
return '</strong>' + parsed;
} else if (arr[i + 1] === undefined) {
open.push('</strong>');
return '<strong>' + parsed;
}
}
return i % 2 ? '<strong>' + parsed + '</strong>' : parsed;
}).join('');
if (i % 2) {
var p = open.indexOf('</em>');
if (p != -1) {
open.splice(p, 1);
return '</em>' + parsed;
} else if (arr[i + 1] === undefined) {
open.push('</em>');
return '<em>' + parsed;
}
}
return i % 2 ? '<em>' + parsed + '</em>' : parsed;
}).join('');
return parsed.replace(/\^\(([^)]+)\)/g, '<sup>1ドル</sup>').replace(/\$\(([^)]+)\)/g, '<sub>1ドル</sub>').replaceAll([paren, cparen, carrot, dollar], ['(', ')', '^', '$']);
}).join('') + open.join('');
};
function markdown(input) {
if (input.indexOf('\n') == -1 && input.substr(0, 2) != '> ' && input.substr(0, 2) != '- ' && input.substr(0, 2) != '* ' && input.substr(0, 4) != ' ' && input[0] != '\t' && !input.match(/^(\w+[.)]|#{1,6}) /)) return inlineMarkdown(input);
var blockquote = '',
ul = '',
ol = '',
li = '',
code = '';
return input.split('\n').map(function(val, i, arr) {
if (!val) return '';
var f;
if (val.substr(0, 2) == '> ') {
val = val.substr(2);
if (arr[i + 1] && arr[i + 1].substr(0, 2) == '> ') {
blockquote += val + '\n';
return '';
} else {
var arg = blockquote + val;
blockquote = '';
return '<blockquote>' + markdown(arg) + '</blockquote>';
}
} else if (val.substr(0, 2) == '- ' || val.substr(0, 2) == '* ') {
if (!ul) ul = '<ul>';
val = val.substr(2);
if (li) {
ul += '<li>' + markdown(li) + '</li>';
li = '';
};
if (arr[i + 1] && (arr[i + 1].substr(0, 2) == '- ' || arr[i + 1] && arr[i + 1].substr(0, 2) == '* ')) {
ul += '<li>' + inlineMarkdown(val) + '</li>';
return '';
} else if (arr[i + 1] && (arr[i + 1][0] == '\t' || arr[i + 1] && arr[i + 1].substr(0, 4) == ' ')) {
li += val + '\n';
return '';
} else {
var arg = ul + '<li>' + markdown(val) + '</li>';
ul = '';
return arg + '</ul>';
}
} else if (f = val.match(/^\w+[.)] /)) {
if (!ol) ol = '<ol>';
val = val.substr(f[0].length);
if (li) {
ol += '<li>' + markdown(li) + '</li>';
li = '';
};
if (arr[i + 1] && arr[i + 1].match(/^\w+[.)] /)) {
ol += '<li>' + inlineMarkdown(val) + '</li>';
return '';
} else if (arr[i + 1] && (arr[i + 1][0] == '\t' || arr[i + 1] && arr[i + 1].substr(0, 4) == ' ')) {
li += val + '\n';
return '';
} else {
var arg = ol + '<li>' + inlineMarkdown(val) + '</li>';
ol = '';
return arg + '</ol>';
}
} else if (li && val[0] == '\t') {
li += val.substr(1) + '\n';
if (ul && (!arr[i + 1] || (arr[i + 1][0] != '\t' && arr[i + 1].substr(0, 4) != ' ' && arr[i + 1].substr(2) != '- ' && arr[i + 1].substr(2) != '* '))) {
var arg = ul + '<li>' + markdown(li) + '</li>';
li = '';
return arg + '</ul>';
} else if (ol && (!arr[i + 1] || (arr[i + 1][0] != '\t' && arr[i + 1].substr(0, 4) != ' ' && !arr[i + 1].match(/^\w+[.)] /)))) {
var arg = ol + '<li>' + markdown(li) + '</li>';
li = '';
return arg + '</ol>';
}
return '';
} else if (li && val.substr(0, 4) == ' ') {
li += val.substr(4) + '\n';
if (ul && (!arr[i + 1] || (arr[i + 1][0] != '\t' && arr[i + 1].substr(0, 4) != ' ' && arr[i + 1].substr(2) != '- ' && arr[i + 1].substr(2) != '* '))) {
var arg = ul + '<li>' + markdown(li) + '</li>';
li = '';
return arg + '</ul>';
} else if (ol && (!arr[i + 1] || (arr[i + 1][0] != '\t' && arr[i + 1].substr(0, 4) != ' ' && !arr[i + 1].match(/^\w+[.)] /)))) {
var arg = ol + '<li>' + markdown(li) + '</li>';
li = '';
return arg + '</ol>';
}
return '';
} else if (val[0] == '\t') {
code += val.substr(1);
if (!arr[i + 1] || (arr[i + 1].substr(0, 4) != ' ' && arr[i + 1][0] != '\t')) {
var arg = html(code);
code = '';
return '<code class="blk">' + arg + '</code>';
} else code += '\n';
return '';
} else if (val.substr(0, 4) == ' ') {
code += val.substr(4);
if (!arr[i + 1] || (arr[i + 1].substr(0, 4) != ' ' && arr[i + 1][0] != '\t')) {
var arg = html(code);
code = '';
return '<code class="blk">' + arg + '</code>';
} else code += '\n';
return '';
} else if ((f = val.match(/^#{1,6} /)) && (f = f[0].length - 1)) {
return '<h' + f + '>' + inlineMarkdown(val.substr(f + 1)) + '</h' + f + '>';
} else return '<p>' + inlineMarkdown(val) + '</p>';
}).join('');
};
Example:
> markdown('*hi*')
<em>hi</em>
There's the String.prototype.replaceAll
thing that people might object to. But it's subjective, there's pros and cons, and I like it. And that part is human-optimized because it's used everywhere.
This is supposed to work on a node server too, so I can't use the DOM for html escapement. And since I don't use single quotes in my HTML, I don't bother replacing that. >
doesn't seem to matter either.
inlineMarkdown
replaces literal characters with random strings so the nested array splitting/mappings don't think they're for formatting, before they are converted back to either the literal character or the original string in the case of inline code. I'm kind of concerned about the nesting and repetition... Should I make the random string generation be a separate function? Should I keep an array of literal character replacements? Is there a better way to write the nested split/maps? Is this method efficient?
markdown
operates line-by-line and figures out what to do based on the first characters of the line. For multi-line things (like lists) it appends the line to a variable and deals with its contents in the end of that chunk. It calls itself recursively for blockquotes and nestey-things. Would it be possible to split not by line, but by chunk? Maybe a nasty regex. Could the repetition with ol
and ul
be avoided? Do I have HTML (XHTML-level strictness) or XSS errors?
I know I'm reinventing the wheel here (again). And I'm not following commonmark (in fact *
is only for italic and _
is for bold). But apart from that, is there anything I can improve in this code?
1 Answer 1
Contrary to your claims to not be crazy, I am not convinced. Using random characters is simply crazy. If I were you I would change the charCode in the string to charCodes with a value under 32 (control characters) that are not 9, 10 or 13. Since you need 14 replacement chars I would go for
- 1 (Start Of Header)
- 2 (Start Of Text)
- 3 (End Of Text)
- 4 (End Of Transmission)
- 5 (Enquiry)
- 6 (Acknowledgement)
- 14 (Shift out) -> skipping bell, backspace, tab, line feed, tab, form feed and return
- 15 (Shift in)
- 16 (Data Link Escape) <- Seriously, how cool was this in 1963?
- 17 (Device Control 1)
- 18 (Device Control 2)
- 19 (Device Control 3)
- 20 (Device Control 4)
- 21 Negative Acknowledgment
If you need more, see here: http://en.wikipedia.org/wiki/ASCII#ASCII_control_code_chart
Other than that:
- Split your code up further in well named functions
- Use comments for your regexes!!
- Use well named constants, for example here
class="blk"
<- blk should have a constant on top
-
\$\begingroup\$ Ok, I might be slightly crazy... :P Is there no way for a control character to appear in some kind of user input tho? \$\endgroup\$bjb568– bjb5682014年11月14日 16:44:59 +00:00Commented Nov 14, 2014 at 16:44
-
\$\begingroup\$ That is right, it cannot appear there. \$\endgroup\$konijn– konijn2014年11月14日 18:16:00 +00:00Commented Nov 14, 2014 at 18:16
-
\$\begingroup\$ Is it necessary to create a
class="blk"
constant? I have that scattered thruout a bunch of server-side js files, css files, HTML... I can find them all if it ever needs to be messed with by doing a multi-file search for "blk". It doesn't make the function itself easier to read either since it only appears once... \$\endgroup\$bjb568– bjb5682014年11月15日 00:54:04 +00:00Commented Nov 15, 2014 at 0:54 -
\$\begingroup\$ Fair point, about the constant, it's a judgement call, feel free to ignore me on this one ;) \$\endgroup\$konijn– konijn2014年11月15日 01:50:43 +00:00Commented Nov 15, 2014 at 1:50
Explore related questions
See similar questions with these tags.
Math.random()
factors in a markdown parser/converter \$\endgroup\$