\$\begingroup\$
\$\endgroup\$
1
For a side project, I needed a markdown parser and I decided to roll my own.
It is a SAX-style parser; in other words, you can hook into parser events on the consumer side and do whatever you want with the content.
markdown-parser.h
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
struct markdown_parser;
/**
* A block-level parselet: the pair of callbacks the parser consults, line by
* line, to decide which block construct starts at the current line.
*/
struct block_parselet {
/* Returns non-zero when the line (`len` bytes, no line terminator) starts a
* block this parselet handles. */
int (*matches)(const char* line, size_t len);
/* Called when `matches` returned non-zero for the parser's current line.
* The built-in parselets fire their callbacks and advance the parser's line
* stream past every line they handled. */
void (*parse)(struct markdown_parser *parser);
};
/**
* An inline-level parselet: recognises one kind of inline span (for example
* text between a pair of markers) and emits the events for it.
*/
struct inline_parselet {
/**
* This is called for each inline parselet to determine whether it can
* handle inline content starting at `text` with the given length.
*
* A parselet can inspect the content for the markers it is interested
* in and should set span_start and span_len to describe the span of
* the content within these markers. span_start is an offset relative to
* `text`. It should additionally return the length of the text that
* would be consumed on parsing this span, including any markers.
*
* A parselet not willing to handle content at this point should return
* 0; span_start and span_len are not read in that case.
*/
size_t (*get_span)(const char *text, size_t len,
size_t *span_start, size_t *span_len);
/**
* This is called for each span reported by a parselet via `get_span`.
* A parselet should fire its callbacks here and optionally call
* parse_inline() if the element allows nesting inline elements.
*
* The inline parsing loop in this file does not use the return value.
*/
size_t (*parse_span)(const char *span, size_t span_len,
struct markdown_parser *parser);
};
/**
* Consumer-side event handlers. Every handler receives the `ctx` pointer the
* parser was created with. The parser checks each member for NULL before
* calling it, so unneeded events can be left unset.
*/
struct markdown_callbacks {
/* Heading start/end; `level` is the number of leading '#' characters (1-6). */
void (*h_begin)(void *ctx, int level);
void (*h_end)(void *ctx);
/* Paragraph start/end. */
void (*p_begin)(void *ctx);
void (*p_end)(void *ctx);
/* Unordered list start/end. */
void (*ul_begin)(void *ctx);
void (*ul_end)(void *ctx);
/* List item start/end, fired once per list line between ul_begin and ul_end. */
void (*li_begin)(void *ctx);
void (*li_end)(void *ctx);
/* A run of plain text. `text` points into the buffer being parsed and is
* `len` bytes long; use `len` rather than relying on a terminating NUL. */
void (*text)(void* ctx, const char* text, size_t len);
/* Span delimited by a pair of '*' characters. */
void (*em_begin)(void* ctx);
void (*em_end)(void* ctx);
/* Span delimited by a pair of '_' characters. */
void (*i_begin)(void* ctx);
void (*i_end)(void* ctx);
};
/**
* Cursor that walks a text buffer one line at a time without copying it.
*/
struct line_stream {
/* Start of the text being read; the stream only stores the pointer. */
const char* buffer;
/* Number of bytes in `buffer`. */
size_t length;
/* Offset in `buffer` where the next line to be read begins. */
size_t curr;
/* Start of the most recently read line; NULL until the first advance. */
const char* current_line;
/* Length of the most recently read line, excluding its line terminator. */
size_t current_len;
};
/**
* Parser state: the consumer's callbacks, the input cursor, and two growable
* arrays of registered parselets.
*/
struct markdown_parser {
/* Event handlers supplied by the consumer; dereferenced without a NULL check. */
const struct markdown_callbacks *cb;
/* Opaque pointer handed back to every callback. */
void *ctx;
/* Line cursor over the input buffer. */
struct line_stream line_stream;
/* Inline parselets, tried in registration order; heap array with the number
* of entries in use and the number of allocated slots. */
struct inline_parselet* inline_parselets;
size_t inline_parselets_len;
size_t inline_parselets_capacity;
/* Block parselets, tried in registration order; same layout as above. */
struct block_parselet* block_parselets;
size_t block_parselets_len;
size_t block_parselets_capacity;
};
/**
* Create a parser over `buffer` (`length` bytes) with the built-in block and
* inline parselets registered.
*
* The buffer is not copied, so it must stay valid while the parser is used.
* `cb` is stored by pointer and dereferenced during parsing without a NULL
* check; `ctx` is passed unchanged to every callback.
*
* NOTE(review): registering the built-in parselets allocates the parselet
* arrays with realloc(); no function releasing them is visible in this header.
*/
struct markdown_parser make_markdown_parser(const char *buffer, size_t length,
const struct markdown_callbacks *cb,
void *ctx);
/**
* Run the parser over its buffer, firing the callbacks for each element found.
*/
void parse_markdown(struct markdown_parser *parser);
markdown-parser.c:
#include <stdlib.h>
#include "markdown-parser.h"
static struct line_stream make_line_stream(const char *buffer, size_t length) {
return (struct line_stream){
.buffer = buffer,
.length = length,
.curr = 0,
.current_line = NULL,
.current_len = 0
};
}
/**
 * Report whether unread bytes remain after the current line.
 *
 * @return non-zero when the read offset is still inside the buffer.
 */
static int line_stream_has_next(const struct line_stream *s) {
    return s->length > s->curr;
}
/**
 * Make the next line of the buffer the current line.
 *
 * A line ends at "\n", "\r" or "\r\n"; the terminator is skipped and never
 * included in the current line. Once the buffer is exhausted the current line
 * becomes an empty line located at the end of the buffer.
 */
static void line_stream_advance(struct line_stream *s) {
    const size_t total = s->length;

    if (s->curr >= total) {
        s->current_line = s->buffer + total;
        s->current_len = 0;
        return;
    }

    const size_t begin = s->curr;
    size_t stop = begin;
    for (; stop < total; stop++) {
        const char c = s->buffer[stop];
        if (c == '\n' || c == '\r') {
            break;
        }
    }

    s->current_line = s->buffer + begin;
    s->current_len = stop - begin;

    /* Step over the terminator, treating "\r\n" as a single line break. */
    size_t next = stop;
    if (stop < total) {
        next = stop + 1;
        if (s->buffer[stop] == '\r' && next < total && s->buffer[next] == '\n') {
            next++;
        }
    }
    s->curr = next;
}
/**
 * Copy the current line's start pointer and length into the out-parameters.
 * The line is a view into the stream's buffer and is not NUL-terminated.
 */
static void line_stream_get_current(const struct line_stream *s, const char **out_line,
                                    size_t *out_len) {
    *out_len = s->current_len;
    *out_line = s->current_line;
}
static void register_inline_parselet(struct markdown_parser *parser, struct inline_parselet p) {
if (parser->inline_parselets_capacity >= parser->inline_parselets_len) {
size_t new_capacity = parser->inline_parselets_capacity == 0 ? 4 :
parser->inline_parselets_capacity * 2;
parser->inline_parselets_capacity = new_capacity;
parser->inline_parselets = realloc(parser->inline_parselets,
parser->inline_parselets_capacity * sizeof(p));
}
parser->inline_parselets[parser->inline_parselets_len++] = p;
}
/**
 * Scan `len` bytes of inline content, dispatching recognised spans to the
 * registered inline parselets and reporting everything else as plain text.
 *
 * At each position the parselets are tried in registration order; the first
 * one whose get_span returns non-zero handles the span. Plain text collected
 * before a span is reported through the `text` callback before the span is
 * parsed.
 */
static void parse_inline(const char *text, size_t len,
                         struct markdown_parser *parser) {
    size_t pos = 0;
    size_t pending = 0; /* start of plain text not yet reported */

    while (pos < len) {
        size_t consumed = 0;

        for (size_t k = 0; k < parser->inline_parselets_len && consumed == 0; k++) {
            struct inline_parselet *candidate = &parser->inline_parselets[k];
            size_t span_offset, span_length;

            consumed = candidate->get_span(text + pos, len - pos,
                                           &span_offset, &span_length);
            if (consumed != 0) {
                /* Flush accumulated text before this inline span */
                if (pos > pending && parser->cb->text) {
                    parser->cb->text(parser->ctx, text + pending, pos - pending);
                }
                candidate->parse_span(text + pos + span_offset, span_length, parser);
            }
        }

        if (consumed == 0) {
            /* No parselet wants this byte; it stays part of the pending text. */
            pos++;
            continue;
        }
        pos += consumed;
        pending = pos;
    }

    /* Flush leftover accumulated text */
    if (pending < len && parser->cb->text) {
        parser->cb->text(parser->ctx, text + pending, len - pending);
    }
}
/**
 * Recognise a span delimited by a pair of '*' characters at the start of
 * `text`.
 *
 * @param text       candidate inline content (not NUL-terminated)
 * @param len        number of readable bytes at `text`; may be 0
 * @param span_start set to the offset of the content after the opening '*'
 * @param span_len   set to the length of the content between the markers
 * @return bytes consumed including both markers, or 0 when `text` does not
 *         start with '*' or no closing '*' exists within `len` bytes. The
 *         out-parameters are written only on a non-zero return.
 */
static size_t get_span_em(const char *text, size_t len,
                          size_t *span_start, size_t *span_len) {
    /* Check len first so an empty span never reads text[0]. */
    if (len == 0 || text[0] != '*') {
        return 0;
    }

    const char *closing = memchr(text + 1, '*', len - 1);
    if (closing == NULL) {
        return 0;
    }

    const size_t closing_at = (size_t)(closing - text);
    *span_start = 1;
    *span_len = closing_at - 1;
    return closing_at + 1;
}
/**
 * Emit the events for a '*'-delimited span: em_begin, the span's content
 * parsed as nested inline content, then em_end.
 *
 * @param span   content between the markers (markers excluded)
 * @param len    length of `span` in bytes
 * @param parser parser whose callbacks receive the events
 * @return `len`, the number of span bytes handled. The function previously
 *         ended without returning a value despite its size_t return type.
 */
static size_t parse_span_em(const char *span, size_t len,
                            struct markdown_parser *parser) {
    if (parser->cb->em_begin) {
        parser->cb->em_begin(parser->ctx);
    }
    parse_inline(span, len, parser);
    if (parser->cb->em_end) {
        parser->cb->em_end(parser->ctx);
    }
    return len;
}
/**
 * Recognise a span delimited by a pair of '_' characters at the start of
 * `text`.
 *
 * @param text       candidate inline content (not NUL-terminated)
 * @param len        number of readable bytes at `text`; may be 0
 * @param span_start set to the offset of the content after the opening '_'
 * @param span_len   set to the length of the content between the markers
 * @return bytes consumed including both markers, or 0 when `text` does not
 *         start with '_' or no closing '_' exists within `len` bytes. The
 *         out-parameters are written only on a non-zero return.
 */
static size_t get_span_i(const char *text, size_t len,
                         size_t *span_start, size_t *span_len) {
    /* Check len first so an empty span never reads text[0]. */
    if (len == 0 || text[0] != '_') {
        return 0;
    }

    const char *closing = memchr(text + 1, '_', len - 1);
    if (closing == NULL) {
        return 0;
    }

    const size_t closing_at = (size_t)(closing - text);
    *span_start = 1;
    *span_len = closing_at - 1;
    return closing_at + 1;
}
/**
 * Emit the events for a '_'-delimited span: i_begin, the span's content
 * parsed as nested inline content, then i_end.
 *
 * @param span   content between the markers (markers excluded)
 * @param len    length of `span` in bytes
 * @param parser parser whose callbacks receive the events
 * @return `len`, the number of span bytes handled. The function previously
 *         ended without returning a value despite its size_t return type.
 */
static size_t parse_span_i(const char *span, size_t len,
                           struct markdown_parser *parser) {
    if (parser->cb->i_begin) {
        parser->cb->i_begin(parser->ctx);
    }
    parse_inline(span, len, parser);
    if (parser->cb->i_end) {
        parser->cb->i_end(parser->ctx);
    }
    return len;
}
void register_block_parselet(struct markdown_parser *parser, struct block_parselet p) {
if (parser->block_parselets_capacity >= parser->block_parselets_len) {
size_t new_capacity = parser->block_parselets_capacity == 0 ? 4 :
parser->block_parselets_capacity * 2;
parser->block_parselets_capacity = new_capacity;
parser->block_parselets = realloc(parser->block_parselets,
parser->block_parselets_capacity * sizeof(p));
}
parser->block_parselets[parser->block_parselets_len++] = p;
}
/**
 * Check whether a line consists solely of spaces and tabs.
 *
 * @return 1 for an empty or whitespace-only line, 0 otherwise.
 */
static int is_blank(const char *line, size_t len) {
    const char *const end = line + len;

    for (const char *p = line; p != end; p++) {
        if (!(*p == ' ' || *p == '\t')) {
            return 0;
        }
    }
    return 1;
}
/**
 * Determine the heading level of a line.
 *
 * A heading is one to six '#' characters followed by a space or tab.
 *
 * @return the number of leading '#' characters (1-6), or 0 when the line is
 *         not a heading.
 */
static int get_heading_level(const char *line, size_t len) {
    enum { MAX_LEVEL = 6 };
    int hashes = 0;

    while ((size_t)hashes < len && hashes < MAX_LEVEL && line[hashes] == '#') {
        hashes++;
    }

    if (hashes == 0) {
        return 0;
    }
    /* The marker must be followed by at least one more byte... */
    if ((size_t)hashes >= len) {
        return 0;
    }
    /* ...and that byte must be a space or tab. */
    const char after = line[hashes];
    return (after == ' ' || after == '\t') ? hashes : 0;
}
/** Block matcher for blank lines; returns the result of is_blank(). */
static int match_blank(const char *line, size_t len) {
    const int blank = is_blank(line, len);
    return blank;
}
/** Block handler for blank lines: fires no callbacks and moves to the next line. */
static void parse_blank(struct markdown_parser *parser) {
    struct line_stream *stream = &parser->line_stream;
    line_stream_advance(stream);
}
/** Block matcher for headings: 1 when the line has a heading level, else 0. */
static int match_heading(const char *line, size_t len) {
    const int level = get_heading_level(line, len);
    return level > 0;
}
/**
 * Block handler for a heading line.
 *
 * Fires h_begin with the heading level, parses the text after the '#' markers
 * and the spaces/tabs that follow them as inline content, fires h_end, then
 * moves the stream to the next line.
 */
static void parse_heading(struct markdown_parser *parser) {
    const char *line;
    size_t len;
    line_stream_get_current(&parser->line_stream, &line, &len);

    const int level = get_heading_level(line, len);
    if (parser->cb->h_begin) {
        parser->cb->h_begin(parser->ctx, level);
    }

    /* Skip the '#' run, then the whitespace separating it from the title. */
    const char *const line_end = line + len;
    const char *title = line + level;
    while (title < line_end && (*title == ' ' || *title == '\t')) {
        title++;
    }
    parse_inline(title, (size_t)(line_end - title), parser);

    if (parser->cb->h_end) {
        parser->cb->h_end(parser->ctx);
    }
    line_stream_advance(&parser->line_stream);
}
/**
 * Block matcher for unordered-list items: a '*' or '-' bullet followed by a
 * single space.
 *
 * @return 1 when the line starts a list item, 0 otherwise.
 */
static int match_ul(const char *line, size_t len) {
    if (len < 2) {
        return 0;
    }
    const char bullet = line[0];
    if (bullet != '*' && bullet != '-') {
        return 0;
    }
    return line[1] == ' ';
}
/**
 * Block handler for an unordered list.
 *
 * Fires ul_begin, then for every consecutive line that match_ul() accepts
 * fires li_begin, parses the text after the two-byte bullet prefix as inline
 * content and fires li_end. Fires ul_end at the first line that is not a list
 * item; that line is left as the stream's current line.
 */
static void parse_ul(struct markdown_parser *parser) {
    struct line_stream *stream = &parser->line_stream;
    const char *line;
    size_t len;

    if (parser->cb->ul_begin) {
        parser->cb->ul_begin(parser->ctx);
    }

    for (line_stream_get_current(stream, &line, &len);
         match_ul(line, len);
         line_stream_get_current(stream, &line, &len)) {
        if (parser->cb->li_begin) {
            parser->cb->li_begin(parser->ctx);
        }
        parse_inline(line + 2, len - 2, parser);
        if (parser->cb->li_end) {
            parser->cb->li_end(parser->ctx);
        }
        line_stream_advance(stream);
    }

    if (parser->cb->ul_end) {
        parser->cb->ul_end(parser->ctx);
    }
}
static void parse_paragraph(struct markdown_parser *parser) {
const char* line;
size_t len;
line_stream_get_current(&parser->line_stream, &line, &len);
size_t para_start = line - parser->line_stream.buffer, para_end;
if (parser->cb->p_begin) parser->cb->p_begin(parser->ctx);
while (line_stream_has_next(&parser->line_stream)) {
line_stream_advance(&parser->line_stream);
line_stream_get_current(&parser->line_stream, &line, &len);
for (size_t i = 0; i < parser->block_parselets_len; i++) {
if (parser->block_parselets[i].matches(line, len)) {
goto end_para;
}
}
}
end_para:
para_end = line - parser->line_stream.buffer;
parse_inline(parser->line_stream.buffer + para_start, para_end - para_start, parser);
if (parser->cb->p_end) parser->cb->p_end(parser->ctx);
}
void parse_markdown(struct markdown_parser *parser) {
const char* line;
size_t len;
line_stream_advance(&parser->line_stream);
while (line_stream_has_next(&parser->line_stream)) {
line_stream_get_current(&parser->line_stream, &line, &len);
int matched = 0;
for (size_t i = 0; i < parser->block_parselets_len; i++) {
if (parser->block_parselets[i].matches(line, len)) {
parser->block_parselets[i].parse(parser);
matched = 1;
break;
}
}
if (!matched) {
parse_paragraph(parser);
}
}
}
static const struct block_parselet CORE_BLOCK_BLANK = (struct block_parselet){
.matches = match_blank,
.parse = parse_blank
};
static const struct block_parselet CORE_BLOCK_HEADING = (struct block_parselet){
.matches = match_heading,
.parse = parse_heading
};
static const struct block_parselet CORE_BLOCK_UL = (struct block_parselet){
.matches = match_ul,
.parse = parse_ul
};
static const struct inline_parselet CORE_INLINE_EM = (struct inline_parselet){
.get_span = get_span_em,
.parse_span = parse_span_em
};
static const struct inline_parselet CORE_INLINE_I = (struct inline_parselet){
.get_span = get_span_i,
.parse_span = parse_span_i
};
static void register_core_parselets(struct markdown_parser *parser) {
register_block_parselet(parser, CORE_BLOCK_BLANK);
register_block_parselet(parser, CORE_BLOCK_HEADING);
register_block_parselet(parser, CORE_BLOCK_UL);
register_inline_parselet(parser, CORE_INLINE_EM);
register_inline_parselet(parser, CORE_INLINE_I);
}
/**
 * Create a parser over `buffer` (`length` bytes) that reports events to `cb`,
 * passing `ctx` to every callback.
 *
 * The buffer and the callback table are stored by pointer, not copied. The
 * returned parser starts with empty parselet arrays and then has the built-in
 * parselets registered on it.
 */
struct markdown_parser make_markdown_parser(const char *buffer, size_t length,
                                            const struct markdown_callbacks *cb,
                                            void *ctx) {
    /* Zero-initialisation leaves both parselet arrays NULL with len/capacity 0. */
    struct markdown_parser parser = {0};

    parser.cb = cb;
    parser.ctx = ctx;
    parser.line_stream = make_line_stream(buffer, length);

    register_core_parselets(&parser);
    return parser;
}
I have also written a sample consumer that uses this parser to parse Markdown and output HTML.
md2html.c
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "markdown-parser.h"
/* Rendering state shared by all the log_* callbacks. */
struct ctx {
/* Current nesting depth; log_indent() prints one indent unit per level. */
int indent;
/* Level of the heading most recently opened, used to print its closing tag. */
int h_level;
};
/* Print one indent unit per nesting level; prints nothing for level <= 0. */
static void log_indent(int level) {
    int remaining = level;
    while (remaining > 0) {
        fputs(" ", stdout);
        remaining--;
    }
}
/* Heading start: print "<hN>" at the current depth, remember N for the
 * closing tag, then nest one level deeper. */
static void log_h_begin(void *ctx, int level) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    printf("<h%d>\n", level);
    state->h_level = level;
    state->indent += 1;
}
/* Heading end: un-nest, then print "</hN>" using the level saved at h_begin. */
static void log_h_end(void *ctx) {
    struct ctx *state = ctx;

    log_indent(--state->indent);
    printf("</h%d>\n", state->h_level);
}
/* List start: print "<ul>" at the current depth, then nest one level deeper. */
static void log_ul_begin(void *ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<ul>\n", stdout);
    state->indent += 1;
}
/* List end: un-nest, then print "</ul>" at the restored depth. */
static void log_ul_end(void *ctx) {
    struct ctx *state = ctx;

    log_indent(--state->indent);
    fputs("</ul>\n", stdout);
}
/* List item start: print "<li>" at the current depth, then nest one level deeper. */
static void log_li_begin(void *ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<li>\n", stdout);
    state->indent += 1;
}
/* List item end: un-nest, then print "</li>" at the restored depth. */
static void log_li_end(void *ctx) {
    struct ctx *state = ctx;

    log_indent(--state->indent);
    fputs("</li>\n", stdout);
}
/* Paragraph start: print "<p>" at the current depth, then nest one level deeper. */
static void log_p_begin(void *ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<p>\n", stdout);
    state->indent += 1;
}
/* Paragraph end: un-nest, then print "</p>" at the restored depth. */
static void log_p_end(void *ctx) {
    struct ctx *state = ctx;

    log_indent(--state->indent);
    fputs("</p>\n", stdout);
}
/* '*' span start: print "<em>" at the current depth, then nest one level deeper. */
static void log_em_begin(void *ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<em>\n", stdout);
    state->indent += 1;
}
/* '*' span end: un-nest, then print "</em>" at the restored depth. */
static void log_em_end(void *ctx) {
    struct ctx *state = ctx;

    log_indent(--state->indent);
    fputs("</em>\n", stdout);
}
/* '_' span start: print "<i>" at the current depth, then nest one level deeper. */
static void log_i_begin(void *ctx) {
    struct ctx *state = ctx;

    log_indent(state->indent);
    fputs("<i>\n", stdout);
    state->indent += 1;
}
/* '_' span end: un-nest, then print "</i>" at the restored depth. */
static void log_i_end(void *ctx) {
    struct ctx *state = ctx;

    log_indent(--state->indent);
    fputs("</i>\n", stdout);
}
/*
 * Text event: print the `len` bytes at `text` on their own line at the
 * current depth.
 *
 * The bytes are written with fwrite() instead of printf("%.*s"): the
 * precision of "%.*s" is an int, so a length above INT_MAX would be truncated
 * or turn negative, and a negative precision makes printf read until a NUL
 * byte, which the input buffer is not guaranteed to contain.
 *
 * NOTE(review): the text is written verbatim, without HTML escaping.
 */
static void log_text(void *ctx, const char *text, size_t len) {
    struct ctx *c = ctx;

    log_indent(c->indent);
    fwrite(text, 1, len, stdout);
    putchar('\n');
}
int main(int argc, char** argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <markdown_file>\n", argv[0]);
return 1;
}
const char* path = argv[1];
int fd = open(path, O_RDONLY);
if (fd < 0) {
perror("open");
return 1;
}
struct stat st;
if (fstat(fd, &st) < 0) {
perror("fstat");
close(fd);
return 1;
}
size_t length = st.st_size;
if (length == 0) {
fprintf(stderr, "Empty file.\n");
close(fd);
return 1;
}
const char* mapped = mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0);
if (mapped == MAP_FAILED) {
perror("mmap");
close(fd);
return 1;
}
struct markdown_callbacks cb = {
.h_begin = log_h_begin,
.h_end = log_h_end,
.p_begin = log_p_begin,
.p_end = log_p_end,
.ul_begin = log_ul_begin,
.ul_end = log_ul_end,
.li_begin = log_li_begin,
.li_end = log_li_end,
.em_begin = log_em_begin,
.em_end = log_em_end,
.i_begin = log_i_begin,
.i_end = log_i_end,
.text = log_text,
};
struct ctx ctx = {
.indent = 0,
.h_level = 0,
};
struct markdown_parser parser = make_markdown_parser(mapped, length, &cb, &ctx);
parse_markdown(&parser);
munmap((void*)mapped, length);
close(fd);
return 0;
}
Maarten Bodewes
6,59920 silver badges53 bronze badges
-
1\$\begingroup\$ Please add the review goals. \$\endgroup\$chux– chux2025年07月31日 05:31:25 +00:00Commented Jul 31 at 5:31
lang-c