Markdown to HTML, again

Question 1

I made another markdown parser. (original, this one on github). Look, I'm not crazy (sorta), I just like making markdown parsers...

/**
 * Literal (non-regex) replace-all with an extra "parallel arrays" form.
 *
 * NOTE: this assignment shadows the native String.prototype.replaceAll where
 * the runtime provides one; the array form below is not part of the native
 * signature, and `$`-patterns in `replace` are NOT interpreted here.
 *
 * @param {string|string[]|RegExp} find - A literal needle, a list of literal
 *   needles, or a RegExp (delegated to String.prototype.replace).
 * @param {string|string[]} replace - Replacement for a single needle, or a
 *   list of replacements positionally matching `find`.
 * @returns {string} The text with each needle replaced. With arrays, pairs are
 *   applied in order and processing stops at the first index where either
 *   entry is not a string.
 */
String.prototype.replaceAll = function(find, replace) {
  // Always work on a primitive so a String wrapper object is never returned.
  var text = String(this);
  if (typeof find == 'string') return text.split(find).join(replace);
  // Keep regex callers working even though the native method is shadowed.
  if (find instanceof RegExp) return text.replace(find, replace);
  // Walk the lists by index instead of shift() so the caller's arrays survive.
  for (var k = 0; typeof find[k] == 'string' && typeof replace[k] == 'string'; k++) {
    text = text.split(find[k]).join(replace[k]);
  }
  return text;
};
/**
 * Escape text for inclusion in HTML.
 *
 * `&` is escaped first so the entities produced for `<` and `"` are not
 * escaped a second time. `>` and `'` are left untouched.
 *
 * Uses String.prototype.replace directly rather than an array-style
 * replaceAll, so the escaping does not depend on a patched built-in.
 *
 * @param {*} input - Value to escape; converted with `toString()`.
 * @param {boolean} [replaceQuoteOff] - When truthy, `"` is left unescaped.
 * @returns {string} The escaped text.
 */
function html(input, replaceQuoteOff) {
  var escaped = input.toString().replace(/&/g, '&amp;').replace(/</g, '&lt;');
  return replaceQuoteOff ? escaped : escaped.replace(/"/g, '&quot;');
}
/**
 * Convert one run of inline markdown to HTML.
 *
 * Supported syntax (this dialect, not CommonMark):
 *   `code`            -> <code>      *text*   -> <em>      _text_ -> <strong>
 *   ---text---        -> <del>       +++text+++ -> <ins>
 *   ^word, ^(text)    -> <sup>       $(text)  -> <sub>
 *   ![alt](src)       -> <img>       [text](http(s)://url) -> <a>
 *   bare http(s) URLs are auto-linked.
 *
 * Backslash-escaped characters are temporarily swapped for random tokens so
 * the split-based parsing below does not treat them as formatting; they are
 * swapped back to the literal character (or, inside code spans, to the
 * original backslash sequence) afterwards.
 *
 * Relies on the sibling `html()` for escaping.
 *
 * @param {string} input - Inline markdown source (no block-level syntax).
 * @returns {string} HTML. Tags left unclosed by the input are closed at the end.
 */
function inlineMarkdown(input) {
  // Escapes restored inside each innermost text segment, BEFORE html() runs.
  // Order matters: '\\\\' must be hidden first so "\\*" reads as a literal
  // backslash followed by a real asterisk.
  var earlyChars = ['\\', '`', '*', '_', '-', '+', '.', '#', '>'];
  // Escapes restored last, after the ^(...) / $(...) rules have run.
  var lateChars = ['(', ')', '^', '$'];

  var text = input;
  var reserved = []; // every token handed out so far
  var open = [];     // closing tags still owed, e.g. '</em>'

  // Literal multi-replace over parallel arrays; never mutates its arguments.
  function swap(str, from, to) {
    for (var k = 0; k < from.length; k++) str = str.split(from[k]).join(to[k]);
    return str;
  }

  // True when `token` contains, or is contained in, an already reserved token.
  function overlapsReserved(token) {
    return reserved.some(function(other) {
      return other.indexOf(token) != -1 || token.indexOf(other) != -1;
    });
  }

  // Replace every "\<ch>" in the text with a fresh token and return that token.
  function hide(ch, prefix) {
    var token;
    do {
      token = prefix + Math.random().toString();
    } while (text.indexOf(token) != -1 || overlapsReserved(token));
    reserved.push(token);
    text = text.split('\\' + ch).join(token);
    return token;
  }

  function escapedForm(ch) {
    return '\\' + ch;
  }

  var earlyTokens = earlyChars.map(function(ch) { return hide(ch, ''); });
  // The '#' prefix is presumably there so the /\^(\w+)/ rule cannot swallow
  // the leading digits of a token that directly follows a caret.
  var lateTokens = lateChars.map(function(ch) { return hide(ch, '#'); });

  var allTokens = earlyTokens.concat(lateTokens);
  var allEscaped = earlyChars.concat(lateChars).map(escapedForm);

  // Wrap an odd-indexed split segment in <tag>. A delimiter with no partner
  // in the current segment list opens a tag that a later segment may close;
  // whatever is still open at the very end is closed by the caller.
  function wrap(tag, body, index, parts) {
    if (index % 2 == 0) return body;
    var closer = '</' + tag + '>';
    var pending = open.indexOf(closer);
    if (pending != -1) {
      open.splice(pending, 1);
      return closer + body;
    }
    if (parts[index + 1] === undefined) {
      open.push(closer);
      return '<' + tag + '>' + body;
    }
    return '<' + tag + '>' + body + closer;
  }

  // Escape a plain-text segment and apply the image / link / ^word rules.
  function renderText(segment) {
    return html(swap(segment, earlyTokens, earlyChars), true)
      // ![alt](src) - src must contain a dot and no whitespace, quotes or parens.
      .replace(/!\[([^\]]+)]\(([^\s("\\]+\.[^\s()"\\]+)\)/g, function(match, alt, src) {
        // html(..., true) leaves '"' alone, so escape it here to keep the alt
        // text from breaking out of the attribute.
        return '<img alt="' + alt.split('"').join('&quot;') + '" src="' + src + '" />';
      })
      // [text](http(s)://url)
      .replace(/\[([^\]]+)]\((https?:\/\/[^\s("\\]+\.[^\s()"\\]+)\)/g, '<a href="$2">$1</a>')
      // Bare URL not preceded by ';', '[', '"' or '\'; link text omits the scheme.
      .replace(/([^;["\\])(https?:\/\/([^\s("\\]+\.[^\s()"\\]+))/g, '$1<a href="$2">$3</a>')
      // Bare URL at the very start of the segment.
      .replace(/^(https?:\/\/([^\s("\\]+\.[^\s()"\\]+))/g, '<a href="$1">$2</a>')
      // ^word
      .replace(/\^(\w+)/g, '<sup>$1</sup>');
  }

  return text.split('`').map(function(chunk, i) {
    // Odd chunks sit between backticks: emit verbatim, escapes kept as typed.
    if (i % 2) return '<code>' + html(swap(chunk, allTokens, allEscaped)) + '</code>';

    var rendered = chunk.split('*').map(function(emPart, ei, emParts) {
      var strongHtml = emPart.split('_').map(function(strongPart, si, strongParts) {
        var delHtml = strongPart.split('---').map(function(delPart, di, delParts) {
          var insHtml = delPart.split('+++').map(function(insPart, ii, insParts) {
            return wrap('ins', renderText(insPart), ii, insParts);
          }).join('');
          return wrap('del', insHtml, di, delParts);
        }).join('');
        return wrap('strong', delHtml, si, strongParts);
      }).join('');
      return wrap('em', strongHtml, ei, emParts);
    }).join('');

    // ^(text) and $(text) run before escaped parens/carets/dollars come back.
    var scripted = rendered
      .replace(/\^\(([^)]+)\)/g, '<sup>$1</sup>')
      .replace(/\$\(([^)]+)\)/g, '<sub>$1</sub>');
    return swap(scripted, lateTokens, lateChars);
  }).join('') + open.join('');
}
/**
 * Convert block-level markdown to HTML, one line at a time.
 *
 * Recognised line prefixes: "> " (blockquote), "- " / "* " (unordered list),
 * "<word>." / "<word>)" followed by a space (ordered list), a tab or four
 * spaces (continuation of the current list item, otherwise a code block),
 * and one to six "#" followed by a space (heading). Anything else becomes a
 * paragraph. Blank lines produce no output.
 *
 * Multi-line constructs are buffered in local state and emitted on the last
 * line of the construct. Blockquotes and multi-line list items recurse into
 * markdown(). Relies on the siblings `inlineMarkdown()` and `html()`.
 *
 * @param {string} input - Markdown source.
 * @returns {string} HTML.
 */
function markdown(input) {
  var INDENT = '    '; // four spaces; a single tab is accepted as well

  function isIndented(line) {
    return line[0] == '\t' || line.substr(0, 4) == INDENT;
  }

  function stripIndent(line) {
    return line[0] == '\t' ? line.substr(1) : line.substr(4);
  }

  function isBullet(line) {
    var lead = line.substr(0, 2);
    return lead == '- ' || lead == '* ';
  }

  function isNumbered(line) {
    return /^\w+[.)] /.test(line);
  }

  // Fast path: a single line with no block-level prefix is purely inline.
  if (input.indexOf('\n') == -1 && input.substr(0, 2) != '> ' && !isBullet(input) && !isIndented(input) && !input.match(/^(\w+[.)]|#{1,6}) /)) return inlineMarkdown(input);

  var blockquote = ''; // buffered blockquote lines
  var ul = '';         // HTML of the unordered list being built ('' = none)
  var ol = '';         // HTML of the ordered list being built ('' = none)
  var li = '';         // raw markdown of a multi-line list item being buffered
  var code = '';       // buffered code-block text

  return input.split('\n').map(function(val, i, arr) {
    if (!val) return '';
    var next = arr[i + 1]; // undefined past the end, '' for a blank line
    var nextIndented = !!next && isIndented(next);
    var out;

    if (val.substr(0, 2) == '> ') {
      val = val.substr(2);
      if (next && next.substr(0, 2) == '> ') {
        blockquote += val + '\n';
        return '';
      }
      out = blockquote + val;
      blockquote = '';
      return '<blockquote>' + markdown(out) + '</blockquote>';
    }

    if (isBullet(val)) {
      if (!ul) ul = '<ul>';
      val = val.substr(2);
      if (li) { // flush the multi-line item that preceded this bullet
        ul += '<li>' + markdown(li) + '</li>';
        li = '';
      }
      if (next && isBullet(next)) {
        ul += '<li>' + inlineMarkdown(val) + '</li>';
        return '';
      }
      if (nextIndented) {
        li += val + '\n';
        return '';
      }
      // NOTE(review): the final item goes through markdown() while earlier
      // single-line items use inlineMarkdown(); kept as in the original.
      out = ul + '<li>' + markdown(val) + '</li>';
      ul = '';
      return out + '</ul>';
    }

    var marker = val.match(/^\w+[.)] /);
    if (marker) {
      if (!ol) ol = '<ol>';
      val = val.substr(marker[0].length);
      if (li) { // flush the multi-line item that preceded this entry
        ol += '<li>' + markdown(li) + '</li>';
        li = '';
      }
      if (next && isNumbered(next)) {
        ol += '<li>' + inlineMarkdown(val) + '</li>';
        return '';
      }
      if (nextIndented) {
        li += val + '\n';
        return '';
      }
      out = ol + '<li>' + inlineMarkdown(val) + '</li>';
      ol = '';
      return out + '</ol>';
    }

    if (li && isIndented(val)) {
      // Continuation line of the list item currently being buffered.
      li += stripIndent(val) + '\n';
      if (ul && !(nextIndented || (next && isBullet(next)))) {
        out = ul + '<li>' + markdown(li) + '</li>';
        li = '';
        ul = ''; // list is finished; do not leak its items into the next one
        return out + '</ul>';
      }
      if (ol && !(nextIndented || (next && isNumbered(next)))) {
        out = ol + '<li>' + markdown(li) + '</li>';
        li = '';
        ol = ''; // list is finished; do not leak its items into the next one
        return out + '</ol>';
      }
      return '';
    }

    if (isIndented(val)) {
      code += stripIndent(val);
      if (!nextIndented) {
        out = html(code);
        code = '';
        return '<code class="blk">' + out + '</code>';
      }
      code += '\n';
      return '';
    }

    var hashes = val.match(/^#{1,6} /);
    if (hashes) {
      var level = hashes[0].length - 1; // number of '#' characters
      return '<h' + level + '>' + inlineMarkdown(val.substr(level + 1)) + '</h' + level + '>';
    }

    return '<p>' + inlineMarkdown(val) + '</p>';
  }).join('');
}

Example:

> markdown('*hi*')
<em>hi</em>

There's the String.prototype.replaceAll thing that people might object to. But it's subjective, there's pros and cons, and I like it. And that part is human-optimized because it's used everywhere.

This is supposed to work on a node server too, so I can't use the DOM for html escapement. And since I don't use single quotes in my HTML, I don't bother replacing that. > doesn't seem to matter either.

inlineMarkdown replaces literal characters with random strings so the nested array splitting/mappings don't think they're for formatting, before they are converted back to either the literal character or the original string in the case of inline code. I'm kind of concerned about the nesting and repetition... Should I make the random string generation be a separate function? Should I keep an array of literal character replacements? Is there a better way to write the nested split/maps? Is this method efficient?

markdown operates line-by-line and figures out what to do based on the first characters of the line. For multi-line things (like lists) it appends the line to a variable and deals with its contents in the end of that chunk. It calls itself recursively for blockquotes and nestey-things. Would it be possible to split not by line, but by chunk? Maybe a nasty regex. Could the repetition with ol and ul be avoided? Do I have HTML (XHTML-level strictness) or XSS errors?

I know I'm reinventing the wheel here (again). And I'm not following commonmark (in fact * is only for italic and _ is for bold). But apart from that, is there anything I can improve in this code?

Question 2

You're being a bit stingy with whitespace, which makes the code difficult to review. There are also no comments, and while minimal commenting is good practice, having no comments whatsoever on a monolithic function is generally not a good idea.

Question 3

Run the code through jsbeautifier.com or a similar formatter; it will be more pleasant to read.

Question 4

I don't understand how Math.random() factors in a markdown parser/converter

Question 5

@MadaraUchiha Literal character expressions like '\*' are replaced with random strings to not be seen in things looking for the character ('*').

Question 6

Contrary to your claims to not be crazy, I am not convinced. Using random characters is simply crazy. If I were you I would change the charCode in the string to charCodes with a value under 32 (control characters) that are not 9, 10 or 13. Since you need 14 replacement chars I would go for

1 (Start Of Header)
2 (Start Of Text)
3 (End Of Text)
4 (End Of Transmission)
5 (Enquiry)
6 (Acknowledgement)
14 (Shift out) -> skipping bell, backspace, tab, line feed, vertical tab, form feed and return
15 (Shift in)
16 (Data Link Escape) <- Seriously, how cool was this in 1963?
17 (Device Control 1)
18 (Device Control 2)
19 (Device Control 3)
20 (Device Control 4)
21 Negative Acknowledgment

If you need more, see here: http://en.wikipedia.org/wiki/ASCII#ASCII_control_code_chart

Other than that:

Split your code up further in well named functions
Use comments for your regexes!!
Use well named constants, for example here class="blk" <- blk should have a constant on top

Question 7

Ok, I might be slightly crazy... :P Is there no way for a control character to appear in some kind of user input tho?

Question 8

That is right, it cannot appear there.

Question 9

Is it necessary to create a class="blk" constant? I have that scattered thruout a bunch of server-side js files, css files, HTML... I can find them all if it ever needs to be messed with by doing a multi-file search for "blk". It doesn't make the function itself easier to read either since it only appears once...

Question 10

Fair point, about the constant, it's a judgement call, feel free to ignore me on this one ;)

konijn konijn 34.2k5 gold badges70 silver badges267 bronze badges · Answer 1 · 2014-11-14 15:58:39Z

Contrary to your claims to not be crazy, I am not convinced. Using random characters is simply crazy. If I were you I would change the charCode in the string to charCodes with a value under 32 (control characters) that are not 9, 10 or 13. Since you need 14 replacement chars I would go for

1 (Start Of Header)
2 (Start Of Text)
3 (End Of Text)
4 (End Of Transmission)
5 (Enquiry)
6 (Acknowledgement)
14 (Shift out) -> skipping bell, backspace, tab, line feed, vertical tab, form feed and return
15 (Shift in)
16 (Data Link Escape) <- Seriously, how cool was this in 1963?
17 (Device Control 1)
18 (Device Control 2)
19 (Device Control 3)
20 (Device Control 4)
21 Negative Acknowledgment

If you need more, see here: http://en.wikipedia.org/wiki/ASCII#ASCII_control_code_chart

Other than that:

Split your code up further in well named functions
Use comments for your regexes!!
Use well named constants, for example here class="blk" <- blk should have a constant on top

Ok, I might be slightly crazy... :P Is there no way for a control character to appear in some kind of user input tho?
Is it necessary to create a class="blk" constant? I have that scattered thruout a bunch of server-side js files, css files, HTML... I can find them all if it ever needs to be messed with by doing a multi-file search for "blk". It doesn't make the function itself easier to read either since it only appears once...
Fair point, about the constant, it's a judgement call, feel free to ignore me on this one ;)

Stack Exchange Network

Markdown to HTML, again

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Markdown to HTML, again

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions