6
\$\begingroup\$

For a side project, I needed a markdown parser and I decided to roll my own.

It is a SAX-style parser; in other words, the consumer hooks into parser events and can do whatever it wants with the content.

markdown-parser.h

#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
struct markdown_parser;
/**
 * A block-level parselet: a pair of hooks that recognises and handles one
 * kind of block (blank line, heading, list, ...).
 */
struct block_parselet {
    /** Returns non-zero when the `len` bytes at `line` start a block of this kind. */
    int (*matches)(const char *line, size_t len);

    /**
     * Handles the block that starts at the parser's current line. The core
     * parselets advance the parser's line stream past every line they consume.
     */
    void (*parse)(struct markdown_parser *parser);
};
/**
 * An inline-level parselet: recognises one kind of inline element (for
 * example an emphasis span) and emits the events for it.
 */
struct inline_parselet {
    /**
     * Asked at each position of the inline content whether this parselet
     * can handle the `len` bytes starting at `text`.
     *
     * On a match the parselet looks for the markers it cares about, stores
     * in `*span_start` the offset (relative to `text`) and in `*span_len`
     * the length of the content enclosed by those markers, and returns the
     * total number of bytes that parsing the element consumes, markers
     * included.
     *
     * A parselet that does not want to handle the content at this position
     * returns 0.
     */
    size_t (*get_span)(const char *text, size_t len,
                       size_t *span_start, size_t *span_len);

    /**
     * Invoked for every span reported through `get_span`. The parselet
     * fires its callbacks here and may call parse_inline() when the element
     * is allowed to contain nested inline elements.
     *
     * NOTE(review): the meaning of the return value is not documented, and
     * parse_inline() ignores it.
     */
    size_t (*parse_span)(const char *span, size_t span_len,
                         struct markdown_parser *parser);
};
/**
 * Event callbacks supplied by the consumer. Every entry may be NULL, in
 * which case the corresponding event is skipped. `ctx` is the opaque pointer
 * given to make_markdown_parser().
 */
struct markdown_callbacks {
    /* Heading; `level` is the number of leading '#' characters (1..6). */
    void (*h_begin)(void *ctx, int level);
    void (*h_end)(void *ctx);

    /* Paragraph. */
    void (*p_begin)(void *ctx);
    void (*p_end)(void *ctx);

    /* Unordered list and its items. */
    void (*ul_begin)(void *ctx);
    void (*ul_end)(void *ctx);
    void (*li_begin)(void *ctx);
    void (*li_end)(void *ctx);

    /*
     * Plain text: `len` bytes starting at `text`. The pointer refers into
     * the parsed buffer, so rely on `len` rather than on a terminator.
     */
    void (*text)(void *ctx, const char *text, size_t len);

    /* Span delimited by '*' characters. */
    void (*em_begin)(void *ctx);
    void (*em_end)(void *ctx);

    /* Span delimited by '_' characters. */
    void (*i_begin)(void *ctx);
    void (*i_end)(void *ctx);
};
/**
 * Cursor that walks a text buffer one line at a time without copying it.
 */
struct line_stream {
    const char *buffer;        /* start of the text being read */
    size_t length;             /* number of bytes in `buffer` */
    size_t curr;               /* offset at which the next line starts */
    const char *current_line;  /* most recently read line (points into `buffer`) */
    size_t current_len;        /* length of that line, line terminator excluded */
};
/**
 * Parser state: the consumer's callbacks, the line cursor over the input,
 * and the growable lists of registered parselets.
 */
struct markdown_parser {
    const struct markdown_callbacks *cb;  /* event callbacks; dereferenced unconditionally */
    void *ctx;                            /* passed as first argument to every callback */

    struct line_stream line_stream;       /* cursor over the input buffer */

    /* Inline parselets, tried in registration order. */
    struct inline_parselet *inline_parselets;
    size_t inline_parselets_len;          /* entries in use */
    size_t inline_parselets_capacity;     /* entries allocated */

    /* Block parselets, tried in registration order. */
    struct block_parselet *block_parselets;
    size_t block_parselets_len;           /* entries in use */
    size_t block_parselets_capacity;      /* entries allocated */
};
/**
 * Creates a parser over the `length` bytes at `buffer` and registers the
 * core block and inline parselets on it.
 *
 * The buffer is not copied; the parser keeps the pointer. `cb` is stored by
 * pointer as well; individual callbacks may be NULL, but `cb` itself is
 * dereferenced during parsing. `ctx` is handed back as the first argument
 * of every callback.
 *
 * The parselet arrays inside the returned parser are heap-allocated.
 * NOTE(review): no function releasing them is declared here - presumably
 * the caller frees `inline_parselets` and `block_parselets`; confirm.
 */
struct markdown_parser make_markdown_parser(const char *buffer, size_t length,
 const struct markdown_callbacks *cb,
 void *ctx);
/**
 * Runs the parser over its buffer line by line. Each line is offered to the
 * registered block parselets in order; a line none of them matches is
 * handled as a paragraph. Results are reported through the parser's
 * callbacks.
 */
void parse_markdown(struct markdown_parser *parser);

markdown-parser.c:

#include <stdlib.h>
#include "markdown-parser.h"
static struct line_stream make_line_stream(const char *buffer, size_t length) {
 return (struct line_stream){
 .buffer = buffer,
 .length = length,
 .curr = 0,
 .current_line = NULL,
 .current_len = 0
 };
}
/**
 * Returns non-zero while the read offset has not reached the end of the
 * buffer, i.e. while line_stream_advance() can still fetch another line.
 * This says nothing about whether the *current* line is valid.
 */
static int line_stream_has_next(const struct line_stream* s) {
    const size_t next_offset = s->curr;
    return next_offset < s->length;
}
/**
 * Reads the next line and makes it the current line.
 *
 * A line ends at '\n', '\r' or "\r\n" (the terminator is not part of the
 * line) or at the end of the buffer. Once the buffer is exhausted the
 * current line becomes an empty line located at `buffer + length`.
 */
static void line_stream_advance(struct line_stream* s) {
    const char *buf = s->buffer;
    const size_t total = s->length;

    if (s->curr >= total) {
        /* Nothing left: park the cursor one past the last byte. */
        s->current_line = buf + total;
        s->current_len = 0;
        return;
    }

    const size_t begin = s->curr;
    size_t pos = begin;
    while (pos < total && buf[pos] != '\n' && buf[pos] != '\r') {
        pos++;
    }

    s->current_line = buf + begin;
    s->current_len = pos - begin;

    if (pos == total) {
        /* Last line had no terminator. */
        s->curr = pos;
        return;
    }

    /* Step over the terminator; "\r\n" counts as a single terminator. */
    size_t terminator_len = 1;
    if (buf[pos] == '\r' && pos + 1 < total && buf[pos + 1] == '\n') {
        terminator_len = 2;
    }
    s->curr = pos + terminator_len;
}
/**
 * Reports the current line: `*out_line` receives a pointer into the buffer
 * and `*out_len` the line's length without its terminator.
 */
static void line_stream_get_current(const struct line_stream* s, const char** out_line,
                                    size_t* out_len) {
    *out_len = s->current_len;
    *out_line = s->current_line;
}
static void register_inline_parselet(struct markdown_parser *parser, struct inline_parselet p) {
 if (parser->inline_parselets_capacity >= parser->inline_parselets_len) {
 size_t new_capacity = parser->inline_parselets_capacity == 0 ? 4 :
 parser->inline_parselets_capacity * 2;
 parser->inline_parselets_capacity = new_capacity;
 parser->inline_parselets = realloc(parser->inline_parselets,
 parser->inline_parselets_capacity * sizeof(p));
 }
 parser->inline_parselets[parser->inline_parselets_len++] = p;
}
/**
 * Parses `len` bytes of inline content starting at `text`.
 *
 * At every position the registered inline parselets are asked, in order,
 * whether they can handle the content there. The first one that accepts
 * gets its span parsed and the scan resumes after the bytes it consumed.
 * Bytes no parselet claims are collected and reported through the `text`
 * callback as contiguous runs.
 */
static void parse_inline(const char* text, size_t len,
                         struct markdown_parser* parser) {
    size_t pending_from = 0;  /* start of plain text not yet reported */
    size_t pos = 0;

    while (pos < len) {
        struct inline_parselet *hit = NULL;
        size_t consumed = 0;
        size_t inner_offset = 0;
        size_t inner_len = 0;

        for (size_t k = 0; hit == NULL && k < parser->inline_parselets_len; k++) {
            struct inline_parselet *candidate = &parser->inline_parselets[k];
            consumed = candidate->get_span(text + pos, len - pos,
                                           &inner_offset, &inner_len);
            if (consumed != 0) {
                hit = candidate;
            }
        }

        if (hit == NULL) {
            /* Ordinary character: it stays part of the pending text run. */
            pos++;
            continue;
        }

        /* Report the plain text that precedes this inline element. */
        if (pos > pending_from && parser->cb->text) {
            parser->cb->text(parser->ctx, text + pending_from, pos - pending_from);
        }

        hit->parse_span(text + pos + inner_offset, inner_len, parser);
        pos += consumed;
        pending_from = pos;
    }

    /* Report whatever plain text remains after the last inline element. */
    if (pending_from < len && parser->cb->text) {
        parser->cb->text(parser->ctx, text + pending_from, len - pending_from);
    }
}
/**
 * Span finder for '*'-delimited content.
 *
 * Matches when `text` starts with '*' and another '*' follows within `len`
 * bytes. On a match `*span_start` is set to 1 (just past the opening
 * marker), `*span_len` to the number of bytes between the two markers, and
 * the total length including both markers is returned. Returns 0, leaving
 * the outputs untouched, when there is no match or `len` is 0.
 */
static size_t get_span_em(const char* text, size_t len,
                          size_t *span_start, size_t *span_len) {
    /* Never look at text[0] of an empty range. */
    if (len == 0 || text[0] != '*') {
        return 0;
    }
    for (size_t curr = 1; curr < len; curr++) {
        if (text[curr] == '*') {
            *span_start = 1;
            *span_len = curr - 1;
            return curr + 1;
        }
    }
    return 0;
}
/**
 * Emits the events for a '*'-delimited span: `em_begin`, the span's inline
 * content (parsed recursively so elements can nest), then `em_end`.
 *
 * Returns the number of span bytes handled, which is always `len`. The
 * interface does not define the return value and parse_inline() ignores it,
 * but a function declared to return size_t must return something.
 */
static size_t parse_span_em(const char *span, size_t len,
                            struct markdown_parser *parser) {
    if (parser->cb->em_begin) {
        parser->cb->em_begin(parser->ctx);
    }
    parse_inline(span, len, parser);
    if (parser->cb->em_end) {
        parser->cb->em_end(parser->ctx);
    }
    return len;
}
/**
 * Span finder for '_'-delimited content.
 *
 * Matches when `text` starts with '_' and another '_' follows within `len`
 * bytes. On a match `*span_start` is set to 1 (just past the opening
 * marker), `*span_len` to the number of bytes between the two markers, and
 * the total length including both markers is returned. Returns 0, leaving
 * the outputs untouched, when there is no match or `len` is 0.
 */
static size_t get_span_i(const char* text, size_t len,
                         size_t *span_start, size_t *span_len) {
    /* Never look at text[0] of an empty range. */
    if (len == 0 || text[0] != '_') {
        return 0;
    }
    for (size_t curr = 1; curr < len; curr++) {
        if (text[curr] == '_') {
            *span_start = 1;
            *span_len = curr - 1;
            return curr + 1;
        }
    }
    return 0;
}
/**
 * Emits the events for a '_'-delimited span: `i_begin`, the span's inline
 * content (parsed recursively so elements can nest), then `i_end`.
 *
 * Returns the number of span bytes handled, which is always `len`. The
 * interface does not define the return value and parse_inline() ignores it,
 * but a function declared to return size_t must return something.
 */
static size_t parse_span_i(const char *span, size_t len,
                           struct markdown_parser *parser) {
    if (parser->cb->i_begin) {
        parser->cb->i_begin(parser->ctx);
    }
    parse_inline(span, len, parser);
    if (parser->cb->i_end) {
        parser->cb->i_end(parser->ctx);
    }
    return len;
}
void register_block_parselet(struct markdown_parser *parser, struct block_parselet p) {
 if (parser->block_parselets_capacity >= parser->block_parselets_len) {
 size_t new_capacity = parser->block_parselets_capacity == 0 ? 4 :
 parser->block_parselets_capacity * 2;
 parser->block_parselets_capacity = new_capacity;
 parser->block_parselets = realloc(parser->block_parselets,
 parser->block_parselets_capacity * sizeof(p));
 }
 parser->block_parselets[parser->block_parselets_len++] = p;
}
/**
 * Returns 1 when the line consists solely of spaces and tabs (an empty
 * line counts as blank), 0 otherwise.
 */
static int is_blank(const char* line, size_t len) {
    const char *cursor = line;
    const char *stop = line + len;

    while (cursor != stop) {
        const char ch = *cursor++;
        if (!(ch == ' ' || ch == '\t')) {
            return 0;
        }
    }
    return 1;
}
/**
 * Returns the heading level (1..6) of a line that starts with that many
 * '#' characters followed by a space or tab, or 0 when the line is not a
 * heading. At most six '#' are counted, so a seventh '#' makes the line a
 * non-heading.
 */
static int get_heading_level(const char* line, size_t len) {
    int hashes = 0;

    while (hashes < 6 && (size_t)hashes < len && line[hashes] == '#') {
        hashes++;
    }

    if (hashes == 0) {
        return 0;
    }
    if ((size_t)hashes >= len) {
        /* Nothing follows the markers. */
        return 0;
    }

    const char after = line[hashes];
    return (after == ' ' || after == '\t') ? hashes : 0;
}
/** Block matcher for blank lines: non-zero when the line holds only spaces and tabs. */
static int match_blank(const char* line, size_t len) {
    const int blank = is_blank(line, len);
    return blank;
}
/** Handles a blank line: it produces no event, the stream just moves to the next line. */
static void parse_blank(struct markdown_parser *parser) {
    struct line_stream *stream = &parser->line_stream;
    line_stream_advance(stream);
}
/** Block matcher for headings: 1 when the line has a heading level of at least 1, else 0. */
static int match_heading(const char* line, size_t len) {
    const int level = get_heading_level(line, len);
    return level > 0 ? 1 : 0;
}
/**
 * Handles a heading line: fires `h_begin` with the heading level, parses
 * the text after the '#' markers (and the whitespace following them) as
 * inline content, fires `h_end`, and moves the stream to the next line.
 */
static void parse_heading(struct markdown_parser *parser) {
    struct line_stream *stream = &parser->line_stream;
    const char *text;
    size_t text_len;

    line_stream_get_current(stream, &text, &text_len);
    const int level = get_heading_level(text, text_len);

    if (parser->cb->h_begin) {
        parser->cb->h_begin(parser->ctx, level);
    }

    /* The content starts after the markers and any spaces/tabs behind them. */
    size_t content_at = (size_t)level;
    for (; content_at < text_len; content_at++) {
        if (text[content_at] != ' ' && text[content_at] != '\t') {
            break;
        }
    }
    parse_inline(text + content_at, text_len - content_at, parser);

    if (parser->cb->h_end) {
        parser->cb->h_end(parser->ctx);
    }
    line_stream_advance(stream);
}
/**
 * Block matcher for unordered list items: 1 when the line starts with '*'
 * or '-' immediately followed by a space, 0 otherwise.
 */
static int match_ul(const char *line, size_t len) {
    if (len < 2) {
        return 0;
    }
    if (line[1] != ' ') {
        return 0;
    }
    return line[0] == '*' || line[0] == '-';
}
/**
 * Handles an unordered list: fires `ul_begin`, then for every consecutive
 * line that is a list item fires `li_begin`, parses the text after the
 * two-byte marker as inline content and fires `li_end`; finally fires
 * `ul_end`. The stream is left on the first line that is not a list item.
 */
static void parse_ul(struct markdown_parser *parser) {
    struct line_stream *stream = &parser->line_stream;

    if (parser->cb->ul_begin) {
        parser->cb->ul_begin(parser->ctx);
    }

    for (;;) {
        const char *item;
        size_t item_len;

        line_stream_get_current(stream, &item, &item_len);
        if (!match_ul(item, item_len)) {
            break;
        }

        if (parser->cb->li_begin) {
            parser->cb->li_begin(parser->ctx);
        }
        /* Skip the marker character and the space behind it. */
        parse_inline(item + 2, item_len - 2, parser);
        if (parser->cb->li_end) {
            parser->cb->li_end(parser->ctx);
        }

        line_stream_advance(stream);
    }

    if (parser->cb->ul_end) {
        parser->cb->ul_end(parser->ctx);
    }
}
static void parse_paragraph(struct markdown_parser *parser) {
 const char* line;
 size_t len;
 line_stream_get_current(&parser->line_stream, &line, &len);
 size_t para_start = line - parser->line_stream.buffer, para_end;
 if (parser->cb->p_begin) parser->cb->p_begin(parser->ctx);
 while (line_stream_has_next(&parser->line_stream)) {
 line_stream_advance(&parser->line_stream);
 line_stream_get_current(&parser->line_stream, &line, &len);
 for (size_t i = 0; i < parser->block_parselets_len; i++) {
 if (parser->block_parselets[i].matches(line, len)) {
 goto end_para;
 }
 }
 }
end_para:
 para_end = line - parser->line_stream.buffer;
 parse_inline(parser->line_stream.buffer + para_start, para_end - para_start, parser);
 if (parser->cb->p_end) parser->cb->p_end(parser->ctx);
}
void parse_markdown(struct markdown_parser *parser) {
 const char* line;
 size_t len;
 line_stream_advance(&parser->line_stream);
 while (line_stream_has_next(&parser->line_stream)) {
 line_stream_get_current(&parser->line_stream, &line, &len);
 int matched = 0;
 for (size_t i = 0; i < parser->block_parselets_len; i++) {
 if (parser->block_parselets[i].matches(line, len)) {
 parser->block_parselets[i].parse(parser);
 matched = 1;
 break;
 }
 }
 if (!matched) {
 parse_paragraph(parser);
 }
 }
}
static const struct block_parselet CORE_BLOCK_BLANK = (struct block_parselet){
 .matches = match_blank,
 .parse = parse_blank
};
static const struct block_parselet CORE_BLOCK_HEADING = (struct block_parselet){
 .matches = match_heading,
 .parse = parse_heading
};
static const struct block_parselet CORE_BLOCK_UL = (struct block_parselet){
 .matches = match_ul,
 .parse = parse_ul
};
static const struct inline_parselet CORE_INLINE_EM = (struct inline_parselet){
 .get_span = get_span_em,
 .parse_span = parse_span_em
};
static const struct inline_parselet CORE_INLINE_I = (struct inline_parselet){
 .get_span = get_span_i,
 .parse_span = parse_span_i
};
static void register_core_parselets(struct markdown_parser *parser) {
 register_block_parselet(parser, CORE_BLOCK_BLANK);
 register_block_parselet(parser, CORE_BLOCK_HEADING);
 register_block_parselet(parser, CORE_BLOCK_UL);
 register_inline_parselet(parser, CORE_INLINE_EM);
 register_inline_parselet(parser, CORE_INLINE_I);
}
struct markdown_parser make_markdown_parser(const char *buffer, size_t length,
 const struct markdown_callbacks *cb,
 void *ctx) {
 struct markdown_parser parser = (struct markdown_parser){
 .cb = cb,
 .ctx = ctx,
 .line_stream = make_line_stream(buffer, length),
 .inline_parselets = NULL,
 .inline_parselets_len = 0,
 .inline_parselets_capacity = 0,
 .block_parselets = NULL,
 .block_parselets_len = 0,
 .block_parselets_capacity = 0,
 };
 register_core_parselets(&parser);
 return parser; 
}

I have also written a sample consumer that uses this parser to parse MD and output HTML.

md2html.c

#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "markdown-parser.h"
/** Output state shared by all logging callbacks. */
struct ctx {
    int indent;   /* current nesting depth; one space is printed per level */
    int h_level;  /* level of the heading being printed, reused for its closing tag */
};
/** Prints `level` spaces to stdout; prints nothing when `level` is zero or negative. */
static void log_indent(int level) {
    int remaining = level;
    while (remaining > 0) {
        fputs(" ", stdout);
        remaining--;
    }
}
/**
 * Prints the opening heading tag for `level` at the current indentation,
 * remembers the level for the matching closing tag, and indents one step.
 */
static void log_h_begin(void* ctx, int level) {
    struct ctx *state = ctx;

    state->h_level = level;
    log_indent(state->indent++);
    printf("<h%d>\n", level);
}
/** Steps the indentation back and prints the closing tag of the heading opened last. */
static void log_h_end(void* ctx) {
    struct ctx *state = ctx;

    log_indent(--state->indent);
    printf("</h%d>\n", state->h_level);
}
/** Prints the opening list tag at the current indentation, then indents one step. */
static void log_ul_begin(void *ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<ul>\n", stdout);
    state->indent += 1;
}
/** Steps the indentation back, then prints the closing list tag. */
static void log_ul_end(void *ctx) {
    struct ctx *state = ctx;

    state->indent -= 1;
    log_indent(state->indent);
    fputs("</ul>\n", stdout);
}
/** Prints the opening list-item tag at the current indentation, then indents one step. */
static void log_li_begin(void *ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<li>\n", stdout);
    state->indent += 1;
}
/** Steps the indentation back, then prints the closing list-item tag. */
static void log_li_end(void *ctx) {
    struct ctx *state = ctx;

    state->indent -= 1;
    log_indent(state->indent);
    fputs("</li>\n", stdout);
}
/** Prints the opening paragraph tag at the current indentation, then indents one step. */
static void log_p_begin(void* ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<p>\n", stdout);
    state->indent += 1;
}
/** Steps the indentation back, then prints the closing paragraph tag. */
static void log_p_end(void* ctx) {
    struct ctx *state = ctx;

    state->indent -= 1;
    log_indent(state->indent);
    fputs("</p>\n", stdout);
}
/** Prints the opening <em> tag at the current indentation, then indents one step. */
static void log_em_begin(void* ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<em>\n", stdout);
    state->indent += 1;
}
/** Steps the indentation back, then prints the closing </em> tag. */
static void log_em_end(void* ctx) {
    struct ctx *state = ctx;

    state->indent -= 1;
    log_indent(state->indent);
    fputs("</em>\n", stdout);
}
/** Prints the opening <i> tag at the current indentation, then indents one step. */
static void log_i_begin(void* ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<i>\n", stdout);
    state->indent += 1;
}
/** Steps the indentation back, then prints the closing </i> tag. */
static void log_i_end(void* ctx) {
    struct ctx *state = ctx;

    state->indent -= 1;
    log_indent(state->indent);
    fputs("</i>\n", stdout);
}
/**
 * Prints a run of plain text at the current indentation, followed by a
 * newline.
 *
 * Exactly `len` bytes are written with fwrite(). Passing the length as an
 * `int` precision to printf would truncate or turn negative for very large
 * spans, and a negative precision makes "%.*s" read up to a NUL byte that
 * the input buffer is not guaranteed to contain.
 *
 * NOTE(review): the text is written as-is; '<', '>' and '&' are not escaped.
 */
static void log_text(void* ctx, const char* text, size_t len) {
    struct ctx *c = ctx;

    log_indent(c->indent);
    fwrite(text, 1, len, stdout);
    putchar('\n');
}
/**
 * Entry point: maps the markdown file named by the first argument into
 * memory, parses it, and prints the resulting tags and text to stdout.
 *
 * Returns 0 on success and 1 on a usage error, an empty file, or a failing
 * open/fstat/mmap call (the latter reported with perror).
 */
int main(int argc, char** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <markdown_file>\n", argv[0]);
        return 1;
    }

    int exit_code = 1;
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return exit_code;
    }

    struct stat info;
    if (fstat(fd, &info) < 0) {
        perror("fstat");
        goto close_file;
    }

    size_t length = info.st_size;
    if (length == 0) {
        fprintf(stderr, "Empty file.\n");
        goto close_file;
    }

    const char *mapped = mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapped == MAP_FAILED) {
        perror("mmap");
        goto close_file;
    }

    /* Wire every parser event to its logging callback. */
    struct markdown_callbacks cb = {
        .h_begin = log_h_begin,
        .h_end = log_h_end,
        .p_begin = log_p_begin,
        .p_end = log_p_end,
        .ul_begin = log_ul_begin,
        .ul_end = log_ul_end,
        .li_begin = log_li_begin,
        .li_end = log_li_end,
        .em_begin = log_em_begin,
        .em_end = log_em_end,
        .i_begin = log_i_begin,
        .i_end = log_i_end,
        .text = log_text,
    };
    struct ctx ctx = {
        .indent = 0,
        .h_level = 0,
    };

    struct markdown_parser parser = make_markdown_parser(mapped, length, &cb, &ctx);
    parse_markdown(&parser);

    munmap((void*)mapped, length);
    exit_code = 0;

close_file:
    close(fd);
    return exit_code;
}
Maarten Bodewes
6,59920 silver badges53 bronze badges
asked Jul 27 at 16:07
\$\endgroup\$
1
  • 1
    \$\begingroup\$ Please add the review goals. \$\endgroup\$ Commented Jul 31 at 5:31

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.