Merge remote-tracking branch 'github/master' into vmitchell/sync-upstream-gfm.7

rdar://104622655
This commit is contained in:
Victoria Mitchell
2023-01-24 15:58:55 -07:00
32 changed files with 14723 additions and 10710 deletions

1
.gitignore vendored
View File

@@ -33,6 +33,7 @@ build
cmark.dSYM/*
cmark
.vscode
.DS_Store
# Testing and benchmark
alltests.md

View File

@@ -31,6 +31,16 @@ set(CMAKE_C_STANDARD_REQUIRED YES)
# Use CMake's generated headers instead of the Swift package prebuilt ones
add_compile_definitions(CMARK_USE_CMAKE_HEADERS)
# Opt-in switch for the quadratic-complexity fuzzing harness that lives in
# the fuzz/ subdirectory. Off by default so regular builds are unaffected.
option(CMARK_FUZZ_QUADRATIC "Build quadratic fuzzing harness" OFF)

if(CMARK_FUZZ_QUADRATIC)
  # Instrument everything that gets built (libraries and executables) for
  # fuzzing coverage and AddressSanitizer. The libFuzzer driver itself is
  # only requested by the fuzz targets (-fsanitize=fuzzer in fuzz/).
  set(FUZZER_FLAGS "-fsanitize=fuzzer-no-link,address -g")

  # The same flags must reach every compile and link step. Shared libraries
  # are included so that an instrumented shared library can resolve the
  # sanitizer runtime symbols at link time as well.
  foreach(fuzz_flags_var IN ITEMS
      CMAKE_CXX_FLAGS
      CMAKE_C_FLAGS
      CMAKE_EXE_LINKER_FLAGS
      CMAKE_SHARED_LINKER_FLAGS
      CMAKE_MODULE_LINKER_FLAGS)
    set(${fuzz_flags_var} "${${fuzz_flags_var}} ${FUZZER_FLAGS}")
  endforeach()
endif()
add_subdirectory(src)
add_subdirectory(extensions)
if(CMARK_TESTS AND (CMARK_SHARED OR CMARK_STATIC))
@@ -41,6 +51,9 @@ if(CMARK_TESTS)
enable_testing()
add_subdirectory(test testdir)
endif()
# Only descend into fuzz/ when the quadratic fuzzing harness was requested.
if(CMARK_FUZZ_QUADRATIC)
add_subdirectory(fuzz)
endif()
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING

View File

@@ -22,7 +22,7 @@ VERSION?=$(SPECVERSION)
RELEASE?=CommonMark-$(VERSION)
INSTALL_PREFIX?=/usr/local
CLANG_CHECK?=clang-check
CLANG_FORMAT=clang-format-3.5 -style llvm -sort-includes=0 -i
CLANG_FORMAT=clang-format -style llvm -sort-includes=0 -i
AFL_PATH?=/usr/local/bin
.PHONY: all cmake_build leakcheck clean fuzztest test debug ubsan asan mingw archive newbench bench format update-spec afl clang-check docker libFuzzer
@@ -140,7 +140,7 @@ $(EXTDIR)/ext_scanners.c: $(EXTDIR)/ext_scanners.re
esac
re2c --case-insensitive -b -i --no-generation-date -8 \
--encoding-policy substitute -o $@ $<
clang-format-3.5 -style llvm -i $@
clang-format -style llvm -i $@
# We include entities.inc in the repository, so normally this
# doesn't need to be regenerated:
@@ -211,7 +211,7 @@ format:
$(CLANG_FORMAT) src/*.c src/*.h api_test/*.c api_test/*.h
format-extensions:
clang-format-3.5 -style llvm -i extensions/*.c extensions/*.h
clang-format -style llvm -i extensions/*.c extensions/*.h
operf: $(CMARK)
operf $< < $(BENCHFILE) > /dev/null

View File

@@ -1575,6 +1575,7 @@ int main() {
int retval;
test_batch_runner *runner = test_batch_runner_new();
cmark_init_standard_node_flags();
version(runner);
constructor(runner);
accessors(runner);

View File

@@ -143,6 +143,7 @@ int main(int argc, char *argv[]) {
}
#endif
cmark_init_standard_node_flags();
cmark_gfm_core_extensions_ensure_registered();
#ifdef USE_PLEDGE

View File

@@ -1,3 +1,20 @@
[0.29.0.gfm.7]
* Fixed a polynomial time complexity issue per
https://github.com/github/cmark-gfm/security/advisories/GHSA-r572-jvj2-3m8p
* Fixed an issue in which a crafted markdown document could trigger an
out-of-bounds read in the validate_protocol function per
https://github.com/github/cmark-gfm/security/advisories/GHSA-c944-cv5f-hpvr
* Fixed a polynomial time complexity issue per
https://github.com/github/cmark-gfm/security/advisories/GHSA-24f7-9frr-5h2r
* Fixed several polynomial time complexity issues per
https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c
* We removed an unneeded .DS_Store file (#291)
* We added a test for domains with underscores and fixed roundtrip behavior (#292)
* We now use an up-to-date clang-format (#294)
* We made a variety of implicit integer truncations explicit by moving to
size_t as our standard size integer type (#302)
[0.29.0.gfm.6]
* Fixed polynomial time complexity DoS vulnerability in autolink extension

View File

@@ -2,6 +2,7 @@
#include <parser.h>
#include <string.h>
#include <utf8.h>
#include <stddef.h>
#if defined(_WIN32)
#define strncasecmp _strnicmp
@@ -35,30 +36,63 @@ static int sd_autolink_issafe(const uint8_t *link, size_t link_len) {
}
static size_t autolink_delim(uint8_t *data, size_t link_end) {
uint8_t cclose, copen;
size_t i;
size_t closing = 0;
size_t opening = 0;
for (i = 0; i < link_end; ++i)
if (data[i] == '<') {
for (i = 0; i < link_end; ++i) {
const uint8_t c = data[i];
if (c == '<') {
link_end = i;
break;
} else if (c == '(') {
opening++;
} else if (c == ')') {
closing++;
}
}
while (link_end > 0) {
cclose = data[link_end - 1];
switch (cclose) {
switch (data[link_end - 1]) {
case ')':
copen = '(';
break;
default:
copen = 0;
}
if (strchr("?!.,:*_~'\"", data[link_end - 1]) != NULL)
/* Allow any number of matching brackets (as recognised in copen/cclose)
* at the end of the URL. If there is a greater number of closing
* brackets than opening ones, we remove one character from the end of
* the link.
*
* Examples (input text => output linked portion):
*
* http://www.pokemon.com/Pikachu_(Electric)
* => http://www.pokemon.com/Pikachu_(Electric)
*
* http://www.pokemon.com/Pikachu_((Electric)
* => http://www.pokemon.com/Pikachu_((Electric)
*
* http://www.pokemon.com/Pikachu_(Electric))
* => http://www.pokemon.com/Pikachu_(Electric)
*
* http://www.pokemon.com/Pikachu_((Electric))
* => http://www.pokemon.com/Pikachu_((Electric))
*/
if (closing <= opening) {
return link_end;
}
closing--;
link_end--;
else if (data[link_end - 1] == ';') {
break;
case '?':
case '!':
case '.':
case ',':
case ':':
case '*':
case '_':
case '~':
case '\'':
case '"':
link_end--;
break;
case ';': {
size_t new_end = link_end - 2;
while (new_end > 0 && cmark_isalpha(data[new_end]))
@@ -68,46 +102,12 @@ static size_t autolink_delim(uint8_t *data, size_t link_end) {
link_end = new_end;
else
link_end--;
} else if (copen != 0) {
size_t closing = 0;
size_t opening = 0;
i = 0;
/* Allow any number of matching brackets (as recognised in copen/cclose)
* at the end of the URL. If there is a greater number of closing
* brackets than opening ones, we remove one character from the end of
* the link.
*
* Examples (input text => output linked portion):
*
* http://www.pokemon.com/Pikachu_(Electric)
* => http://www.pokemon.com/Pikachu_(Electric)
*
* http://www.pokemon.com/Pikachu_((Electric)
* => http://www.pokemon.com/Pikachu_((Electric)
*
* http://www.pokemon.com/Pikachu_(Electric))
* => http://www.pokemon.com/Pikachu_(Electric)
*
* http://www.pokemon.com/Pikachu_((Electric))
* => http://www.pokemon.com/Pikachu_((Electric))
*/
while (i < link_end) {
if (data[i] == copen)
opening++;
else if (data[i] == cclose)
closing++;
i++;
}
if (closing <= opening)
break;
link_end--;
} else
break;
}
default:
return link_end;
}
}
return link_end;
@@ -116,7 +116,20 @@ static size_t autolink_delim(uint8_t *data, size_t link_end) {
static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
size_t i, np = 0, uscore1 = 0, uscore2 = 0;
/* The purpose of this code is to reject urls that contain an underscore
* in one of the last two segments. Examples:
*
* www.xxx.yyy.zzz autolinked
* www.xxx.yyy._zzz not autolinked
* www.xxx._yyy.zzz not autolinked
* www._xxx.yyy.zzz autolinked
*
* The reason is that domain names are allowed to include underscores,
* but host names are not. See: https://stackoverflow.com/a/2183140
*/
for (i = 1; i < size - 1; i++) {
if (data[i] == '\\' && i < size - 2)
i++;
if (data[i] == '_')
uscore2++;
else if (data[i] == '.') {
@@ -127,8 +140,17 @@ static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
break;
}
if (uscore1 > 0 || uscore2 > 0)
return 0;
if (uscore1 > 0 || uscore2 > 0) {
/* If the url is very long then accept it despite the underscores,
* to avoid quadratic behavior causing a denial of service. See:
* https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c
* Reasonable urls are unlikely to have more than 10 segments, so
* this extra condition shouldn't have any impact on normal usage.
*/
if (np <= 10) {
return 0;
}
}
if (allow_short) {
/* We don't need a valid domain in the strict sense (with
@@ -165,7 +187,7 @@ static cmark_node *www_match(cmark_parser *parser, cmark_node *parent,
if (link_end == 0)
return NULL;
while (link_end < size && !cmark_isspace(data[link_end]))
while (link_end < size && !cmark_isspace(data[link_end]) && data[link_end] != '<')
link_end++;
link_end = autolink_delim(data, link_end);
@@ -225,7 +247,7 @@ static cmark_node *url_match(cmark_parser *parser, cmark_node *parent,
return 0;
link_end += domain_len;
while (link_end < size && !cmark_isspace(data[link_end]))
while (link_end < size && !cmark_isspace(data[link_end]) && data[link_end] != '<')
link_end++;
link_end = autolink_delim(data, link_end);
@@ -269,142 +291,167 @@ static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser,
// inline was finished in inlines.c.
}
static bool validate_protocol(char protocol[], uint8_t *data, int rewind) {
static bool validate_protocol(char protocol[], uint8_t *data, size_t rewind, size_t max_rewind) {
size_t len = strlen(protocol);
// Check that the protocol matches
for (int i = 1; i <= len; i++) {
if (data[-rewind - i] != protocol[len - i]) {
return false;
}
if (len > (max_rewind - rewind)) {
return false;
}
char prev_char = data[-rewind - len - 1];
// Check that the protocol matches
if (memcmp(data - rewind - len, protocol, len) != 0) {
return false;
}
if (len == (max_rewind - rewind)) {
return true;
}
char prev_char = data[-((ptrdiff_t)rewind) - len - 1];
// Make sure the character before the protocol is non-alphanumeric
return !cmark_isalnum(prev_char);
}
static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset, int depth) {
// postprocess_text can recurse very deeply if there is a very long line of
// '@' only. Stop at a reasonable depth to ensure it cannot crash.
if (depth > 1000) return;
static void postprocess_text(cmark_parser *parser, cmark_node *text) {
size_t start = 0;
size_t offset = 0;
// `text` is going to be split into a list of nodes containing shorter segments
// of text, so we detach the memory buffer from text and use `cmark_chunk_dup` to
// create references to it. Later, `cmark_chunk_to_cstr` is used to convert
// the references into allocated buffers. The detached buffer is freed before we
// return.
cmark_chunk detached_chunk = text->as.literal;
text->as.literal = cmark_chunk_dup(&detached_chunk, 0, detached_chunk.len);
size_t link_end;
uint8_t *data = text->as.literal.data,
*at;
size_t size = text->as.literal.len;
bool auto_mailto = true;
bool is_xmpp = false;
int rewind, max_rewind,
nb = 0, np = 0, ns = 0;
uint8_t *data = text->as.literal.data;
size_t remaining = text->as.literal.len;
if (offset < 0 || (size_t)offset >= size)
return;
while (true) {
size_t link_end;
uint8_t *at;
bool auto_mailto = true;
bool is_xmpp = false;
size_t rewind;
size_t max_rewind;
size_t np = 0;
data += offset;
size -= offset;
if (offset >= remaining)
break;
at = (uint8_t *)memchr(data, '@', size);
if (!at)
return;
at = (uint8_t *)memchr(data + start + offset, '@', remaining - offset);
if (!at)
break;
max_rewind = (int)(at - data);
data += max_rewind;
size -= max_rewind;
max_rewind = at - (data + start + offset);
for (rewind = 0; rewind < max_rewind; ++rewind) {
uint8_t c = data[-rewind - 1];
found_at:
for (rewind = 0; rewind < max_rewind; ++rewind) {
uint8_t c = data[start + offset + max_rewind - rewind - 1];
if (cmark_isalnum(c))
continue;
if (strchr(".+-_", c) != NULL)
continue;
if (strchr(":", c) != NULL) {
if (validate_protocol("mailto:", data, rewind)) {
auto_mailto = false;
if (cmark_isalnum(c))
continue;
if (strchr(".+-_", c) != NULL)
continue;
if (strchr(":", c) != NULL) {
if (validate_protocol("mailto:", data + start + offset + max_rewind, rewind, max_rewind)) {
auto_mailto = false;
continue;
}
if (validate_protocol("xmpp:", data + start + offset + max_rewind, rewind, max_rewind)) {
auto_mailto = false;
is_xmpp = true;
continue;
}
}
if (validate_protocol("xmpp:", data, rewind)) {
auto_mailto = false;
is_xmpp = true;
continue;
}
break;
}
break;
}
if (rewind == 0 || ns > 0) {
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
return;
}
for (link_end = 0; link_end < size; ++link_end) {
uint8_t c = data[link_end];
if (cmark_isalnum(c))
if (rewind == 0) {
offset += max_rewind + 1;
continue;
}
if (c == '@')
nb++;
else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1]))
np++;
else if (c == '/' && is_xmpp)
assert(data[start + offset + max_rewind] == '@');
for (link_end = 1; link_end < remaining - offset - max_rewind; ++link_end) {
uint8_t c = data[start + offset + max_rewind + link_end];
if (cmark_isalnum(c))
continue;
if (c == '@') {
// Found another '@', so go back and try again with an updated offset and max_rewind.
offset += max_rewind + 1;
max_rewind = link_end - 1;
goto found_at;
} else if (c == '.' && link_end < remaining - offset - max_rewind - 1 &&
cmark_isalnum(data[start + offset + max_rewind + link_end + 1]))
np++;
else if (c == '/' && is_xmpp)
continue;
else if (c != '-' && c != '_')
break;
}
if (link_end < 2 || np == 0 ||
(!cmark_isalpha(data[start + offset + max_rewind + link_end - 1]) &&
data[start + offset + max_rewind + link_end - 1] != '.')) {
offset += max_rewind + link_end;
continue;
else if (c != '-' && c != '_')
break;
}
if (link_end < 2 || nb != 1 || np == 0 ||
(!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) {
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
return;
}
link_end = autolink_delim(data, link_end);
if (link_end == 0) {
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
return;
}
link_end = autolink_delim(data + start + offset + max_rewind, link_end);
if (link_end == 0) {
offset += max_rewind + 1;
continue;
}
cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
cmark_strbuf buf;
cmark_strbuf_init(parser->mem, &buf, 10);
if (auto_mailto)
cmark_strbuf_puts(&buf, "mailto:");
cmark_strbuf_put(&buf, data + start + offset + max_rewind - rewind, (bufsize_t)(link_end + rewind));
link_node->as.link.url = cmark_chunk_buf_detach(&buf);
cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
cmark_chunk email = cmark_chunk_dup(
&detached_chunk,
(bufsize_t)(start + offset + max_rewind - rewind),
(bufsize_t)(link_end + rewind));
cmark_chunk_to_cstr(parser->mem, &email);
link_text->as.literal = email;
cmark_node_append_child(link_node, link_text);
cmark_node_insert_after(text, link_node);
cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
post->as.literal = cmark_chunk_dup(&detached_chunk,
(bufsize_t)(start + offset + max_rewind + link_end),
(bufsize_t)(remaining - offset - max_rewind - link_end));
cmark_node_insert_after(link_node, post);
text->as.literal = cmark_chunk_dup(&detached_chunk, (bufsize_t)start, (bufsize_t)(offset + max_rewind - rewind));
cmark_chunk_to_cstr(parser->mem, &text->as.literal);
text = post;
start += offset + max_rewind + link_end;
remaining -= offset + max_rewind + link_end;
offset = 0;
}
// Convert the reference to allocated memory.
assert(!text->as.literal.alloc);
cmark_chunk_to_cstr(parser->mem, &text->as.literal);
cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
cmark_strbuf buf;
cmark_strbuf_init(parser->mem, &buf, 10);
if (auto_mailto)
cmark_strbuf_puts(&buf, "mailto:");
cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind));
link_node->as.link.url = cmark_chunk_buf_detach(&buf);
cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
cmark_chunk email = cmark_chunk_dup(
&text->as.literal,
offset + max_rewind - rewind,
(bufsize_t)(link_end + rewind));
cmark_chunk_to_cstr(parser->mem, &email);
link_text->as.literal = email;
cmark_node_append_child(link_node, link_text);
cmark_node_insert_after(text, link_node);
cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
post->as.literal = cmark_chunk_dup(&text->as.literal,
(bufsize_t)(offset + max_rewind + link_end),
(bufsize_t)(size - link_end));
cmark_chunk_to_cstr(parser->mem, &post->as.literal);
cmark_node_insert_after(link_node, post);
text->as.literal.len = offset + max_rewind - rewind;
text->as.literal.data[text->as.literal.len] = 0;
postprocess_text(parser, post, 0, depth + 1);
// Free the detached buffer.
cmark_chunk_free(parser->mem, &detached_chunk);
}
static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser, cmark_node *root) {
@@ -431,7 +478,7 @@ static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser
}
if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_TEXT) {
postprocess_text(parser, node, 0, /*depth*/0);
postprocess_text(parser, node);
}
}

View File

@@ -67,6 +67,7 @@ static delimiter *insert(cmark_syntax_extension *self, cmark_parser *parser,
strikethrough->end_column = closer->inl_text->start_column + closer->inl_text->as.literal.len - 1;
cmark_node_free(closer->inl_text);
done:
delim = closer;
while (delim != NULL && delim != opener) {
tmp_delim = delim->previous;
@@ -76,7 +77,6 @@ static delimiter *insert(cmark_syntax_extension *self, cmark_parser *parser,
cmark_inline_parser_remove_delimiter(inline_parser, opener);
done:
return res;
}

View File

@@ -11,24 +11,12 @@
#include "table.h"
#include "cmark-gfm-core-extensions.h"
// Custom node flag, initialized in `create_table_extension`.
static cmark_node__internal_flags CMARK_NODE__TABLE_VISITED;
cmark_node_type CMARK_NODE_TABLE, CMARK_NODE_TABLE_ROW,
CMARK_NODE_TABLE_CELL;
typedef struct {
uint16_t n_columns;
int paragraph_offset;
cmark_llist *cells;
} table_row;
typedef struct {
uint16_t n_columns;
uint8_t *alignments;
} node_table;
typedef struct {
bool is_header;
} node_table_row;
typedef struct {
unsigned colspan, rowspan;
} node_cell_data;
@@ -39,21 +27,41 @@ typedef struct {
node_cell_data *cell_data;
} node_cell;
static void free_table_cell(cmark_mem *mem, void *data) {
node_cell *cell = (node_cell *)data;
typedef struct {
uint16_t n_columns;
int paragraph_offset;
node_cell *cells;
} table_row;
typedef struct {
uint16_t n_columns;
uint8_t *alignments;
} node_table;
typedef struct {
bool is_header;
} node_table_row;
static void free_table_cell(cmark_mem *mem, node_cell *cell) {
cmark_strbuf_free((cmark_strbuf *)cell->buf);
mem->free(cell->buf);
if (cell->cell_data)
mem->free(cell->cell_data);
mem->free(cell);
}
// Releases every cell stored in `row` (last to first), then the cell array
// itself. On return the row is empty: n_columns is 0 and cells is NULL.
static void free_row_cells(cmark_mem *mem, table_row *row) {
  for (; row->n_columns > 0; row->n_columns--) {
    node_cell *last = &row->cells[row->n_columns - 1];
    free_table_cell(mem, last);
  }

  mem->free(row->cells);
  row->cells = NULL;
}
static void free_table_row(cmark_mem *mem, table_row *row) {
if (!row)
return;
cmark_llist_free_full(mem, row->cells, (cmark_free_func)free_table_cell);
free_row_cells(mem, row);
mem->free(row);
}
@@ -175,6 +183,24 @@ static cmark_strbuf *unescape_pipes(cmark_mem *mem, unsigned char *string, bufsi
return res;
}
// Adds a new cell to the end of the row and returns a pointer to it for the
// caller to initialize. The returned cell is zero-filled, so every pointer
// field starts out NULL and the row can be torn down safely even if the
// caller bails out before populating it.
//
// Returns NULL (leaving the row untouched) when adding the cell would make
// the column count exceed UINT16_MAX.
//
// The cell array grows geometrically: it is reallocated whenever the new
// column count is a power of two, to room for (2 * n_columns - 1) cells,
// which is enough until the next power of two is reached.
static node_cell* append_row_cell(cmark_mem *mem, table_row *row) {
  const uint32_t n_columns = row->n_columns + 1;
  // realloc when n_columns is a power of 2
  if ((n_columns & (n_columns-1)) == 0) {
    // make sure we never wrap row->n_columns
    // offset will != len and our exit will clean up as intended
    // (UINT16_MAX + 1 is itself a power of two, so checking here suffices)
    if (n_columns > UINT16_MAX) {
      return NULL;
    }
    // Use realloc to double the size of the buffer.
    row->cells = (node_cell *)mem->realloc(row->cells, (2 * n_columns - 1) * sizeof(node_cell));
  }
  row->n_columns = (uint16_t)n_columns;

  // realloc does not clear the memory it hands back; start from a known
  // all-zero cell rather than whatever bytes happen to be there.
  node_cell *cell = &row->cells[n_columns-1];
  const node_cell blank_cell = {0};
  *cell = blank_cell;
  return cell;
}
static table_row *row_from_string(cmark_syntax_extension *self,
cmark_parser *parser, unsigned char *string,
int len) {
@@ -216,15 +242,22 @@ static table_row *row_from_string(cmark_syntax_extension *self,
cell_matched);
cmark_strbuf_trim(cell_buf);
node_cell *cell = (node_cell *)parser->mem->calloc(1, sizeof(*cell));
node_cell *cell = append_row_cell(parser->mem, row);
if (!cell) {
int_overflow_abort = 1;
cmark_strbuf_free(cell_buf);
parser->mem->free(cell_buf);
break;
}
cell->buf = cell_buf;
cell->start_offset = offset;
if (cell_matched > 0)
cell->end_offset = offset + cell_matched - 1;
else
cell->end_offset = offset;
cell->internal_offset = 0;
while (cell->start_offset > 0 && string[cell->start_offset - 1] != '|') {
while (cell->start_offset > row->paragraph_offset && string[cell->start_offset - 1] != '|') {
--cell->start_offset;
++cell->internal_offset;
}
@@ -237,13 +270,11 @@ static table_row *row_from_string(cmark_syntax_extension *self,
cell->cell_data->colspan = 0;
// find the last cell that isn't part of a colspan, and increment that colspan
cmark_llist *tmp = row->cells;
node_cell *colspan_cell = NULL;
while (tmp) {
node_cell *this_cell = (node_cell *)tmp->data;
for (uint16_t i = 0; i < row->n_columns; i++) {
node_cell *this_cell = &row->cells[i];
if (this_cell->cell_data->colspan > 0)
colspan_cell = this_cell;
tmp = tmp->next;
}
if (colspan_cell)
++colspan_cell->cell_data->colspan;
@@ -272,8 +303,6 @@ static table_row *row_from_string(cmark_syntax_extension *self,
int_overflow_abort = 1;
break;
}
row->n_columns += 1;
row->cells = cmark_llist_append(parser->mem, row->cells, cell);
}
offset += cell_matched + pipe_matched;
@@ -291,9 +320,7 @@ static table_row *row_from_string(cmark_syntax_extension *self,
if (row_end_offset && offset != len) {
row->paragraph_offset = offset;
cmark_llist_free_full(parser->mem, row->cells, (cmark_free_func)free_table_cell);
row->cells = NULL;
row->n_columns = 0;
free_row_cells(parser->mem, row);
// Scan past the (optional) leading pipe.
offset += scan_table_cell_end(string, len, offset);
@@ -344,6 +371,10 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
const char *parent_string;
uint16_t i;
if (parent_container->flags & CMARK_NODE__TABLE_VISITED) {
return parent_container;
}
if (!scan_table_start(input, len, cmark_parser_get_first_nonspace(parser))) {
return parent_container;
}
@@ -371,6 +402,7 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
free_table_row(parser->mem, marker_row);
free_table_row(parser->mem, header_row);
cmark_arena_pop();
parent_container->flags |= CMARK_NODE__TABLE_VISITED;
return parent_container;
}
@@ -407,9 +439,8 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
// since we populate the alignments array based on marker_row->cells
uint8_t *alignments =
(uint8_t *)parser->mem->calloc(marker_row->n_columns, sizeof(uint8_t));
cmark_llist *it = marker_row->cells;
for (i = 0; it; it = it->next, ++i) {
node_cell *node = (node_cell *)it->data;
for (i = 0; i < marker_row->n_columns; ++i) {
node_cell *node = &marker_row->cells[i];
bool left = node->buf->ptr[0] == ':', right = node->buf->ptr[node->buf->size - 1] == ':';
if (left && right)
@@ -432,10 +463,8 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
ntr->is_header = true;
{
cmark_llist *tmp;
for (tmp = header_row->cells; tmp; tmp = tmp->next) {
node_cell *cell = (node_cell *) tmp->data;
for (i = 0; i < header_row->n_columns; ++i) {
node_cell *cell = &header_row->cells[i];
cmark_node *header_cell = cmark_parser_add_child(parser, table_header,
CMARK_NODE_TABLE_CELL, parent_container->start_column + cell->start_offset);
header_cell->start_line = header_cell->end_line = parent_container->start_line;
@@ -487,11 +516,10 @@ static cmark_node *try_opening_table_row(cmark_syntax_extension *self,
if (parser->options & CMARK_OPT_TABLE_SPANS) {
// Check the new row for rowspan markers and increment the rowspan of the cell it's merging with
cmark_llist *tmp;
int i;
for (tmp = row->cells, i = 0; tmp && i < table_columns; tmp = tmp->next, ++i) {
node_cell *this_cell = (node_cell *)tmp->data;
for (i = 0; i < row->n_columns && i < table_columns; ++i) {
node_cell *this_cell = &row->cells[i];
if (this_cell->cell_data->rowspan == 0) {
// Rowspan marker. Scan up through previous rows and increment the spanning cell's rowspan
cmark_node *check_row = table_row_block->prev;
@@ -515,11 +543,10 @@ static cmark_node *try_opening_table_row(cmark_syntax_extension *self,
}
{
cmark_llist *tmp;
int i;
for (tmp = row->cells, i = 0; tmp && i < table_columns; tmp = tmp->next, ++i) {
node_cell *cell = (node_cell *) tmp->data;
for (i = 0; i < row->n_columns && i < table_columns; ++i) {
node_cell *cell = &row->cells[i];
cmark_node *node = cmark_parser_add_child(parser, table_row_block,
CMARK_NODE_TABLE_CELL, parent_container->start_column + cell->start_offset);
node->internal_offset = cell->internal_offset;
@@ -980,6 +1007,7 @@ static int escape(cmark_syntax_extension *self, cmark_node *node, int c) {
cmark_syntax_extension *create_table_extension(void) {
cmark_syntax_extension *self = cmark_syntax_extension_new("table");
cmark_register_node_flag(&CMARK_NODE__TABLE_VISITED);
cmark_syntax_extension_set_match_block_func(self, matches);
cmark_syntax_extension_set_open_block_func(self, try_opening_table_block);
cmark_syntax_extension_set_get_type_string_func(self, get_type_string);

21
fuzz/CMakeLists.txt Normal file
View File

@@ -0,0 +1,21 @@
# Header search path for the fuzz targets: generated headers from the build
# tree first, then the extension and core sources that sit next to fuzz/.
include_directories(
  "${PROJECT_BINARY_DIR}/extensions"
  "${PROJECT_BINARY_DIR}/src"
  "${CMAKE_CURRENT_SOURCE_DIR}/../extensions"
  "${CMAKE_CURRENT_SOURCE_DIR}/../src"
)
# fuzzer(<name>)
#
# Defines a libFuzzer executable target called <name>, built from <name>.c
# in this directory and linked against the cmark-gfm core and extensions
# libraries (shared if CMARK_SHARED is set, otherwise static if CMARK_STATIC
# is set).
#
# Example:
#   fuzzer(fuzz_quadratic)
function(fuzzer name)
  add_executable(${name} ${name}.c)

  # -fsanitize=fuzzer is needed both when compiling and when linking.
  target_compile_options(${name} PRIVATE -fsanitize=fuzzer)
  set_target_properties(${name}
    PROPERTIES
      LINK_FLAGS "-fsanitize=fuzzer")

  if(CMARK_SHARED)
    target_link_libraries(${name}
      PRIVATE
        libcmark-gfm-extensions
        libcmark-gfm)
  elseif(CMARK_STATIC)
    target_link_libraries(${name}
      PRIVATE
        libcmark-gfm-extensions_static
        libcmark-gfm_static)
  else()
    # Without either library the harness cannot link; say so at configure
    # time instead of failing later with undefined symbols.
    message(FATAL_ERROR
      "fuzzer(${name}): enable CMARK_SHARED or CMARK_STATIC to build the fuzzing harness")
  endif()
endfunction()
# Build the quadratic-complexity harness from fuzz_quadratic.c.
fuzzer(fuzz_quadratic)

12
fuzz/README.md Normal file
View File

@@ -0,0 +1,12 @@
The quadratic fuzzer generates long sequences of repeated characters, such as `<?x<?x<?x<?x...`,
to detect quadratic complexity performance issues.
To build and run the quadratic fuzzer:
```bash
mkdir build-fuzz
cd build-fuzz
cmake -DCMARK_FUZZ_QUADRATIC=ON -DCMAKE_C_COMPILER=$(which clang) -DCMAKE_CXX_COMPILER=$(which clang++) -DCMAKE_BUILD_TYPE=Release ..
make
../fuzz/fuzzloop.sh
```

87
fuzz/fuzz_quadratic.c Normal file
View File

@@ -0,0 +1,87 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "cmark-gfm.h"
#include "cmark-gfm-core-extensions.h"
/* Names of the syntax extensions attached to every parser created by the
 * fuzz target. The list is NULL-terminated so it can be walked without a
 * separate length. */
const char *extension_names[] = {
"autolink",
"strikethrough",
"table",
"tagfilter",
NULL, /* end-of-list sentinel */
};
/* One-time setup hook for the fuzzer: initializes the standard node flags
 * and makes sure the core GFM extensions are registered. The command-line
 * arguments are not inspected. Always returns 0. */
int LLVMFuzzerInitialize(int *argc, char ***argv) {
  (void)argc;
  (void)argv;

  cmark_init_standard_node_flags();
  cmark_gfm_core_extensions_ensure_registered();

  return 0;
}
/* Fuzz entry point.
 *
 * The first sizeof(fuzz_config) bytes of `data` are read as configuration;
 * the rest is markdown. When the configuration describes a valid split, the
 * markdown is expanded to nearly fill a 512 KiB buffer:
 *
 *   [0, splitpoint)                          copied once (prefix)
 *   [splitpoint, splitpoint + repeatlen)     repeated as often as fits
 *   [splitpoint + repeatlen, end)            copied once (suffix)
 *
 * Otherwise the markdown is used unchanged. The result is parsed with all
 * extensions in `extension_names` attached and rendered to HTML. Inputs that
 * are shorter than the configuration header, or whose markdown part does not
 * fit in the buffer, are ignored. Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  struct __attribute__((packed)) {
    int options;
    int width;
    uint8_t splitpoint;
    uint8_t repeatlen;
  } fuzz_config;
  char markdown[0x80000];

  /* Too short to even hold the configuration header. */
  if (size < sizeof(fuzz_config)) {
    return 0;
  }

  /* The beginning of `data` is treated as fuzzer configuration */
  memcpy(&fuzz_config, data, sizeof(fuzz_config));

  /* Test options that are used by GitHub. */
  fuzz_config.options = CMARK_OPT_UNSAFE | CMARK_OPT_FOOTNOTES | CMARK_OPT_GITHUB_PRE_LANG | CMARK_OPT_HARDBREAKS;

  /* Remainder of input is the markdown */
  const char *input = (const char *)(data + sizeof(fuzz_config));
  const size_t input_len = size - sizeof(fuzz_config);
  if (input_len > sizeof(markdown)) {
    return 0;
  }

  const size_t prefix_len = fuzz_config.splitpoint;
  const size_t unit_len = fuzz_config.repeatlen;
  size_t markdown_size = 0;

  if (prefix_len <= input_len && unit_len > 0 &&
      unit_len <= input_len - prefix_len) {
    const size_t suffix_len = input_len - prefix_len - unit_len;

    /* Prefix, copied once. */
    memcpy(markdown, input, prefix_len);
    markdown_size = prefix_len;

    /* Repeated unit: keep appending while the unit plus the suffix still fit. */
    for (; markdown_size + unit_len + suffix_len <= sizeof(markdown);
         markdown_size += unit_len) {
      memcpy(markdown + markdown_size, input + prefix_len, unit_len);
    }

    /* Suffix, copied once. */
    memcpy(markdown + markdown_size, input + prefix_len + unit_len, suffix_len);
    markdown_size += suffix_len;
  } else {
    /* No usable split: feed the markdown through unchanged. */
    markdown_size = input_len;
    memcpy(markdown, input, markdown_size);
  }

  cmark_parser *parser = cmark_parser_new(fuzz_config.options);

  for (size_t i = 0; extension_names[i] != NULL; ++i) {
    const char *extension_name = extension_names[i];
    cmark_syntax_extension *syntax_extension = cmark_find_syntax_extension(extension_name);
    if (syntax_extension == NULL) {
      fprintf(stderr, "%s is not a valid syntax extension\n", extension_name);
      abort();
    }
    cmark_parser_attach_syntax_extension(parser, syntax_extension);
  }

  cmark_parser_feed(parser, markdown, markdown_size);
  cmark_node *doc = cmark_parser_finish(parser);

  /* Only the act of rendering matters; the HTML itself is discarded. */
  free(cmark_render_html(doc, fuzz_config.options, NULL));

  cmark_node_free(doc);
  cmark_parser_free(parser);

  return 0;
}

28
fuzz/fuzzloop.sh Executable file
View File

@@ -0,0 +1,28 @@
#!/bin/bash
#
# Runs the quadratic fuzzer in an endless minimize-then-fuzz loop.
#
# All paths below (./fuzz/fuzz_quadratic, ../bench, ../test/fuzzing_dictionary)
# are relative to the current directory, so run this from the build directory.

# Stop when an error is found
set -e

fuzzer=./fuzz/fuzz_quadratic

# Fail early, before touching the corpus, if the fuzzer has not been built
# or the script was started from the wrong directory.
if [ ! -x "$fuzzer" ]; then
  echo "error: $fuzzer not found or not executable; run this script from the build directory" >&2
  exit 1
fi

# Number of parallel fuzzing jobs. `nproc` is not available everywhere, so
# fall back to sysctl and finally to a single job.
ncpu=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)

# Create a corpus sub-directory if it doesn't already exist.
mkdir -p corpus

# The memory and disk usage grows over time, so this loop restarts the
# fuzzer every 4 hours. The `-merge=1` option is used to minimize the
# corpus on each iteration.
while :
do
  date
  echo restarting loop

  # Minimize the corpus
  mv corpus/ corpus2
  mkdir corpus
  echo minimizing corpus
  "$fuzzer" -merge=1 corpus ../bench corpus2/ -max_len=1024
  rm -r corpus2

  # Run the fuzzer for 4 hours
  date
  echo start fuzzer
  "$fuzzer" corpus -dict=../test/fuzzing_dictionary -jobs="$ncpu" -workers="$ncpu" -max_len=1024 -max_total_time=14400
done

View File

@@ -84,19 +84,17 @@ static void *arena_calloc(size_t nmem, size_t size) {
CMARK_INITIALIZE_AND_LOCK(arena);
void *ptr = NULL;
struct arena_chunk *chunk;
if (sz > A->sz) {
A->prev = alloc_arena_chunk(sz, A->prev);
ptr = (uint8_t *) A->prev->ptr;
A->prev = chunk = alloc_arena_chunk(sz, A->prev);
} else if (sz > A->sz - A->used) {
A = chunk = alloc_arena_chunk(A->sz + A->sz / 2, A);
} else {
if (sz > A->sz - A->used) {
A = alloc_arena_chunk(A->sz + A->sz / 2, A);
}
ptr = (uint8_t *) A->ptr + A->used;
A->used += sz;
*((size_t *) ptr) = sz - sizeof(size_t);
chunk = A;
}
void *ptr = (uint8_t *) chunk->ptr + chunk->used;
chunk->used += sz;
*((size_t *) ptr) = sz - sizeof(size_t);
CMARK_UNLOCK(arena);

View File

@@ -8,6 +8,7 @@
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include <limits.h>
#include "cmark_ctype.h"
#include "syntax_extension.h"
@@ -665,6 +666,14 @@ static cmark_node *finalize_document(cmark_parser *parser) {
}
finalize(parser, parser->root);
// Limit total size of extra content created from reference links to
// document size to avoid superlinear growth. Always allow 100KB.
if (parser->total_size > 100000)
parser->refmap->max_ref_size = parser->total_size;
else
parser->refmap->max_ref_size = 100000;
process_inlines(parser, parser->refmap, parser->options);
if (parser->options & CMARK_OPT_FOOTNOTES)
process_footnotes(parser);
@@ -725,6 +734,11 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
static const uint8_t repl[] = {239, 191, 189};
bool preserveWhitespace = parser->options & CMARK_OPT_PRESERVE_WHITESPACE;
if (len > UINT_MAX - parser->total_size)
parser->total_size = UINT_MAX;
else
parser->total_size += len;
if (parser->last_buffer_ended_with_cr && *buffer == '\n') {
// skip NL if last buffer ended with CR ; see #117
buffer++;

View File

@@ -114,6 +114,7 @@ typedef struct delimiter {
struct delimiter *previous;
struct delimiter *next;
cmark_node *inl_text;
bufsize_t position;
bufsize_t length;
unsigned char delim_char;
int can_open;

View File

@@ -10,7 +10,8 @@ extern "C" {
struct cmark_map_entry {
struct cmark_map_entry *next;
unsigned char *label;
unsigned int age;
size_t age;
size_t size;
};
typedef struct cmark_map_entry cmark_map_entry;
@@ -23,7 +24,9 @@ struct cmark_map {
cmark_mem *mem;
cmark_map_entry *refs;
cmark_map_entry **sorted;
unsigned int size;
size_t size;
size_t ref_size;
size_t max_ref_size;
cmark_map_free_f free;
};

View File

@@ -52,11 +52,7 @@ typedef struct {
cmark_chunk on_exit;
} cmark_custom;
enum cmark_node__internal_flags {
CMARK_NODE__OPEN = (1 << 0),
CMARK_NODE__LAST_LINE_BLANK = (1 << 1),
CMARK_NODE__LAST_LINE_CHECKED = (1 << 2),
};
typedef uint16_t cmark_node__internal_flags;
struct cmark_node {
cmark_strbuf content;
@@ -76,7 +72,7 @@ struct cmark_node {
int end_column;
int internal_offset;
uint16_t type;
uint16_t flags;
cmark_node__internal_flags flags;
int backtick_count;
cmark_syntax_extension *extension;
@@ -101,6 +97,30 @@ struct cmark_node {
} as;
};
/**
* Syntax extensions can use this function to register a custom node
* flag. The flags are stored in the `flags` field of the `cmark_node`
* struct. The `flags` parameter should be the address of a global variable
* which will store the flag value.
*/
CMARK_GFM_EXPORT
void cmark_register_node_flag(cmark_node__internal_flags *flags);
/**
* Standard node flags. (Initialized using `cmark_init_standard_node_flags`.)
*/
extern cmark_node__internal_flags CMARK_NODE__OPEN;
extern cmark_node__internal_flags CMARK_NODE__LAST_LINE_BLANK;
extern cmark_node__internal_flags CMARK_NODE__LAST_LINE_CHECKED;
/**
 * Uses `cmark_register_node_flag` to initialize the standard node flags
 * (`CMARK_NODE__OPEN`, `CMARK_NODE__LAST_LINE_BLANK` and
 * `CMARK_NODE__LAST_LINE_CHECKED`).
 * This function should be called at program startup time. Calling it
 * multiple times has no additional effect.
 *
 * Declared with an explicit `(void)` parameter list so that this is a real
 * prototype: in C an empty `()` leaves the parameters unspecified and lets
 * calls with stray arguments compile silently.
 */
CMARK_GFM_EXPORT
void cmark_init_standard_node_flags(void);
static CMARK_INLINE cmark_mem *cmark_node_mem(cmark_node *node) {
return node->content.mem;
}

View File

@@ -47,6 +47,7 @@ struct cmark_parser {
/* Options set by the user, see the Options section in cmark.h */
int options;
bool last_buffer_ended_with_cr;
size_t total_size;
cmark_llist *syntax_extensions;
cmark_llist *inline_syntax_extensions;
cmark_ispunct_func backslash_ispunct;

View File

@@ -15,6 +15,10 @@ bufsize_t _scan_autolink_uri(const unsigned char *p);
bufsize_t _scan_autolink_email(const unsigned char *p);
bufsize_t _scan_html_tag(const unsigned char *p);
bufsize_t _scan_liberal_html_tag(const unsigned char *p);
bufsize_t _scan_html_comment(const unsigned char *p);
bufsize_t _scan_html_pi(const unsigned char *p);
bufsize_t _scan_html_declaration(const unsigned char *p);
bufsize_t _scan_html_cdata(const unsigned char *p);
bufsize_t _scan_html_block_start(const unsigned char *p);
bufsize_t _scan_html_block_start_7(const unsigned char *p);
bufsize_t _scan_html_block_end_1(const unsigned char *p);
@@ -37,6 +41,10 @@ bufsize_t _scan_footnote_definition(const unsigned char *p);
#define scan_autolink_email(c, n) _scan_at(&_scan_autolink_email, c, n)
#define scan_html_tag(c, n) _scan_at(&_scan_html_tag, c, n)
#define scan_liberal_html_tag(c, n) _scan_at(&_scan_liberal_html_tag, c, n)
#define scan_html_comment(c, n) _scan_at(&_scan_html_comment, c, n)
#define scan_html_pi(c, n) _scan_at(&_scan_html_pi, c, n)
#define scan_html_declaration(c, n) _scan_at(&_scan_html_declaration, c, n)
#define scan_html_cdata(c, n) _scan_at(&_scan_html_cdata, c, n)
#define scan_html_block_start(c, n) _scan_at(&_scan_html_block_start, c, n)
#define scan_html_block_start_7(c, n) _scan_at(&_scan_html_block_start_7, c, n)
#define scan_html_block_end_1(c, n) _scan_at(&_scan_html_block_end_1, c, n)

View File

@@ -41,7 +41,6 @@ typedef enum {
typedef struct bracket {
struct bracket *previous;
struct delimiter *previous_delimiter;
cmark_node *inl_text;
bufsize_t position;
bracket_type type;
@@ -50,9 +49,15 @@ typedef struct bracket {
bool in_bracket[4];
} bracket;
#define FLAG_SKIP_HTML_CDATA (1u << 0)
#define FLAG_SKIP_HTML_DECLARATION (1u << 1)
#define FLAG_SKIP_HTML_PI (1u << 2)
#define FLAG_SKIP_HTML_COMMENT (1u << 3)
typedef struct subject{
cmark_mem *mem;
cmark_chunk input;
unsigned flags;
int line;
bufsize_t pos;
int block_offset;
@@ -62,6 +67,7 @@ typedef struct subject{
bracket *last_bracket;
bufsize_t backticks[MAXBACKTICKS + 1];
bool scanned_for_backticks;
bool no_link_openers;
} subject;
void cmark_set_default_skip_chars(int8_t **skip_chars, bool use_memcpy) {
@@ -122,6 +128,24 @@ static cmark_node *make_str_with_entities(subject *subj,
}
}
// Fast-path counterpart of cmark_node_append_child: makes `child` the new
// last child of `node` without performing any validation.
// The child's next/prev/parent pointers are simply overwritten, so the caller
// must hand in a node that is not linked into a tree at this point.
static void append_child(cmark_node *node, cmark_node *child) {
  cmark_node *tail = node->last_child;

  child->parent = node;
  child->prev = tail;
  child->next = NULL;

  if (tail == NULL) {
    // node had no children before, so the new child is also the first one.
    node->first_child = child;
  } else {
    tail->next = child;
  }
  node->last_child = child;
}
// Duplicate a chunk by creating a copy of the buffer not by reusing the
// buffer like cmark_chunk_dup does.
static cmark_chunk chunk_clone(cmark_mem *mem, cmark_chunk *src) {
@@ -165,7 +189,7 @@ static CMARK_INLINE cmark_node *make_autolink(subject *subj,
link->start_line = link->end_line = subj->line;
link->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
link->end_column = end_column + 1 + subj->column_offset + subj->block_offset;
cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
return link;
}
@@ -174,6 +198,7 @@ static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset,
int i;
e->mem = mem;
e->input = *chunk;
e->flags = 0;
e->line = line_number;
e->pos = 0;
e->block_offset = block_offset;
@@ -185,6 +210,7 @@ static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset,
e->backticks[i] = 0;
}
e->scanned_for_backticks = false;
e->no_link_openers = true;
}
static CMARK_INLINE int isbacktick(int c) { return (c == '`'); }
@@ -520,6 +546,7 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open,
delim->can_open = can_open;
delim->can_close = can_close;
delim->inl_text = inl_text;
delim->position = subj->pos;
delim->length = inl_text->as.literal.len;
delim->previous = subj->last_delim;
delim->next = NULL;
@@ -539,11 +566,13 @@ static void push_bracket(subject *subj, bracket_type type, cmark_node *inl_text)
b->active = true;
b->inl_text = inl_text;
b->previous = subj->last_bracket;
b->previous_delimiter = subj->last_delim;
b->position = subj->pos;
b->bracket_after = false;
b->in_bracket[type] = true;
subj->last_bracket = b;
if (type != IMAGE) {
subj->no_link_openers = false;
}
}
// Assumes the subject has a c at the current position.
@@ -650,12 +679,13 @@ static cmark_syntax_extension *get_extension_for_special_char(cmark_parser *pars
return NULL;
}
static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *stack_bottom) {
delimiter *closer = subj->last_delim;
static void process_emphasis(cmark_parser *parser, subject *subj, bufsize_t stack_bottom) {
delimiter *candidate;
delimiter *closer = NULL;
delimiter *opener;
delimiter *old_closer;
bool opener_found;
delimiter *openers_bottom[3][128];
bufsize_t openers_bottom[3][128];
int i;
// initialize openers_bottom:
@@ -668,8 +698,10 @@ static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *sta
}
// move back to first relevant delim.
while (closer != NULL && closer->previous != stack_bottom) {
closer = closer->previous;
candidate = subj->last_delim;
while (candidate != NULL && candidate->position >= stack_bottom) {
closer = candidate;
candidate = candidate->previous;
}
// now move forward, looking for closers, and handling each
@@ -679,8 +711,8 @@ static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *sta
// Now look backwards for first matching opener:
opener = closer->previous;
opener_found = false;
while (opener != NULL && opener != stack_bottom &&
opener != openers_bottom[closer->length % 3][closer->delim_char]) {
while (opener != NULL && opener->position >= stack_bottom &&
opener->position >= openers_bottom[closer->length % 3][closer->delim_char]) {
if (opener->can_open && opener->delim_char == closer->delim_char) {
// interior closer of size 2 can't match opener of size 1
// or of size 1 can't match 2
@@ -706,27 +738,29 @@ static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *sta
} else {
closer = closer->next;
}
} else if (closer->delim_char == '\'') {
} else if (closer->delim_char == '\'' || closer->delim_char == '"') {
cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
if (opener_found) {
cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
if (closer->delim_char == '\'') {
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
} else {
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
}
closer = closer->next;
} else if (closer->delim_char == '"') {
cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
if (opener_found) {
cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
if (old_closer->delim_char == '\'') {
opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
} else {
opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
}
remove_delimiter(subj, opener);
remove_delimiter(subj, old_closer);
}
closer = closer->next;
}
if (!opener_found) {
// set lower bound for future searches for openers
openers_bottom[old_closer->length % 3][old_closer->delim_char] =
old_closer->previous;
old_closer->position;
if (!old_closer->can_open) {
// we can remove a closer that can't be an
// opener, once we've seen there's no
@@ -739,7 +773,8 @@ static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *sta
}
}
// free all delimiters in list until stack_bottom:
while (subj->last_delim != NULL && subj->last_delim != stack_bottom) {
while (subj->last_delim != NULL &&
subj->last_delim->position >= stack_bottom) {
remove_delimiter(subj, subj->last_delim);
}
}
@@ -778,7 +813,8 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
tmp = opener_inl->next;
while (tmp && tmp != closer_inl) {
tmpnext = tmp->next;
cmark_node_append_child(emph, tmp);
cmark_node_unlink(tmp);
append_child(emph, tmp);
tmp = tmpnext;
}
cmark_node_insert_after(opener_inl, emph);
@@ -915,7 +951,63 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) {
}
// finally, try to match an html tag
matchlen = scan_html_tag(&subj->input, subj->pos);
if (subj->pos + 2 <= subj->input.len) {
int c = subj->input.data[subj->pos];
if (c == '!' && (subj->flags & FLAG_SKIP_HTML_COMMENT) == 0) {
c = subj->input.data[subj->pos+1];
if (c == '-' && subj->input.data[subj->pos+2] == '-') {
if (subj->input.data[subj->pos+3] == '>') {
matchlen = 4;
} else if (subj->input.data[subj->pos+3] == '-' &&
subj->input.data[subj->pos+4] == '>') {
matchlen = 5;
} else {
matchlen = scan_html_comment(&subj->input, subj->pos + 1);
if (matchlen > 0) {
matchlen += 1; // prefix "<"
} else { // no match through end of input: set a flag so
// we don't reparse looking for -->:
subj->flags |= FLAG_SKIP_HTML_COMMENT;
}
}
} else if (c == '[') {
if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) {
matchlen = scan_html_cdata(&subj->input, subj->pos + 2);
if (matchlen > 0) {
// The regex doesn't require the final "]]>". But if we're not at
// the end of input, it must come after the match. Otherwise,
// disable subsequent scans to avoid quadratic behavior.
matchlen += 5; // prefix "![", suffix "]]>"
if (subj->pos + matchlen > subj->input.len) {
subj->flags |= FLAG_SKIP_HTML_CDATA;
matchlen = 0;
}
}
}
} else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) {
matchlen = scan_html_declaration(&subj->input, subj->pos + 1);
if (matchlen > 0) {
matchlen += 2; // prefix "!", suffix ">"
if (subj->pos + matchlen > subj->input.len) {
subj->flags |= FLAG_SKIP_HTML_DECLARATION;
matchlen = 0;
}
}
}
} else if (c == '?') {
if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) {
// Note that we allow an empty match.
matchlen = scan_html_pi(&subj->input, subj->pos + 1);
matchlen += 3; // prefix "?", suffix "?>"
if (subj->pos + matchlen > subj->input.len) {
subj->flags |= FLAG_SKIP_HTML_PI;
matchlen = 0;
}
}
} else {
matchlen = scan_html_tag(&subj->input, subj->pos);
}
}
if (matchlen > 0) {
contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
subj->pos += matchlen;
@@ -1170,7 +1262,7 @@ static cmark_node *handle_close_bracket_attribute(cmark_parser *parser, subject
// Free the bracket ^[:
cmark_node_free(opener->inl_text);
process_emphasis(parser, subj, opener->previous_delimiter);
process_emphasis(parser, subj, opener->position);
pop_bracket(subj);
return NULL;
@@ -1201,12 +1293,6 @@ static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
}
if (!opener->active) {
// take delimiter off stack
pop_bracket(subj);
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
}
if (opener->type == ATTRIBUTE) {
return handle_close_bracket_attribute(parser, subj, opener);
}
@@ -1215,6 +1301,12 @@ static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
// Now we check to see if it's a link/image.
is_image = opener->type == IMAGE;
if (!is_image && subj->no_link_openers) {
// take delimiter off stack
pop_bracket(subj);
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
}
after_link_text_pos = subj->pos;
// First, look for an inline link.
@@ -1333,7 +1425,7 @@ noMatch:
// begin replacing the opening '[' text node with a `^footnote-ref]` node.
cmark_node_insert_before(opener->inl_text, fnref);
process_emphasis(parser, subj, opener->previous_delimiter);
process_emphasis(parser, subj, opener->position);
// sometimes, the footnote reference text gets parsed into multiple nodes
// i.e. '[^example]' parsed into '[', '^exam', 'ple]'.
// this happens for ex with the autolink extension. when the autolinker
@@ -1379,42 +1471,22 @@ match:
tmp = opener->inl_text->next;
while (tmp) {
tmpnext = tmp->next;
cmark_node_append_child(inl, tmp);
cmark_node_unlink(tmp);
append_child(inl, tmp);
tmp = tmpnext;
}
// Free the bracket [:
cmark_node_free(opener->inl_text);
process_emphasis(parser, subj, opener->previous_delimiter);
process_emphasis(parser, subj, opener->position);
pop_bracket(subj);
// Now, if we have a link, we also want to deactivate earlier link
// delimiters. (This code can be removed if we decide to allow links
// Now, if we have a link, we also want to deactivate links until
// we get a new opener. (This code can be removed if we decide to allow links
// inside links.)
if (!is_image) {
opener = subj->last_bracket;
while (opener != NULL) {
if (opener->type == LINK) {
if (!opener->active) {
break;
} else {
opener->active = false;
}
}
opener = opener->previous;
}
bool in_image = false;
if (opener) {
in_image = opener->in_bracket[IMAGE];
}
bracket *opener2 = subj->last_bracket;
while (opener2 != opener) {
if (opener2->type == IMAGE) {
opener2->in_bracket[IMAGE] = in_image;
}
opener2 = opener2->previous;
}
subj->no_link_openers = true;
}
return NULL;
@@ -1623,7 +1695,7 @@ static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent,
}
if (new_inl != NULL) {
cmark_node_append_child(parent, new_inl);
append_child(parent, new_inl);
}
return 1;
@@ -1643,7 +1715,7 @@ void cmark_parse_inlines(cmark_parser *parser,
while (!is_eof(&subj) && parse_inline(parser, &subj, parent, options))
;
process_emphasis(parser, &subj, NULL);
process_emphasis(parser, &subj, 0);
// free bracket and delim stack
while (subj.last_delim) {
remove_delimiter(&subj, subj.last_delim);

View File

@@ -51,7 +51,7 @@ refsearch(const void *label, const void *p2) {
}
static void sort_map(cmark_map *map) {
unsigned int i = 0, last = 0, size = map->size;
size_t i = 0, last = 0, size = map->size;
cmark_map_entry *r = map->refs, **sorted = NULL;
sorted = (cmark_map_entry **)map->mem->calloc(size, sizeof(cmark_map_entry *));
@@ -73,6 +73,7 @@ static void sort_map(cmark_map *map) {
cmark_map_entry *cmark_map_lookup(cmark_map *map, cmark_chunk *label) {
cmark_map_entry **ref = NULL;
cmark_map_entry *r = NULL;
unsigned char *norm;
if (label->len < 1 || label->len > MAX_LINK_LABEL_LENGTH)
@@ -91,10 +92,15 @@ cmark_map_entry *cmark_map_lookup(cmark_map *map, cmark_chunk *label) {
ref = (cmark_map_entry **)bsearch(norm, map->sorted, map->size, sizeof(cmark_map_entry *), refsearch);
map->mem->free(norm);
if (!ref)
return NULL;
if (ref != NULL) {
r = ref[0];
/* Check for expansion limit */
if (r->size > map->max_ref_size - map->ref_size)
return NULL;
map->ref_size += r->size;
}
return ref[0];
return r;
}
void cmark_map_free(cmark_map *map) {
@@ -118,5 +124,6 @@ cmark_map *cmark_map_new(cmark_mem *mem, cmark_map_free_f free) {
cmark_map *map = (cmark_map *)mem->calloc(1, sizeof(cmark_map));
map->mem = mem;
map->free = free;
map->max_ref_size = UINT_MAX;
return map;
}

View File

@@ -9,6 +9,40 @@ static void S_node_unlink(cmark_node *node);
#define NODE_MEM(node) cmark_node_mem(node)
cmark_node__internal_flags CMARK_NODE__OPEN;
cmark_node__internal_flags CMARK_NODE__LAST_LINE_BLANK;
cmark_node__internal_flags CMARK_NODE__LAST_LINE_CHECKED;
// Hands out the next unused bit of the node flag word and stores it in
// `*flags`.
//
// `flags` is expected to point at a zero-valued global that is registered
// exactly once. If `*flags` is already non-zero, or if every bit of
// cmark_node__internal_flags has been handed out, an error is printed to
// stderr and the process is aborted.
void cmark_register_node_flag(cmark_node__internal_flags *flags) {
  // Index of the next free bit; persists across calls.
  static uint8_t next_bit = 0;

  // A non-zero value means this variable was already given a flag.
  if (*flags != 0) {
    fprintf(stderr, "flag initialization error in cmark_register_node_flag\n");
    abort();
  }

  // Refuse to shift past the width of the flag type.
  if (next_bit >= 8 * sizeof(cmark_node__internal_flags)) {
    fprintf(stderr, "too many flags in cmark_register_node_flag\n");
    abort();
  }

  *flags = (cmark_node__internal_flags)1 << next_bit;
  next_bit++;
}
// Registers the three standard node flags (CMARK_NODE__OPEN,
// CMARK_NODE__LAST_LINE_BLANK, CMARK_NODE__LAST_LINE_CHECKED) through
// cmark_register_node_flag.
//
// A function-local guard turns every call after the first into a no-op; this
// matters because cmark_register_node_flag aborts when handed a flag variable
// that already holds a value. The guard is a plain static int with no locking.
//
// The parameter list is spelled `(void)` so the definition is a proper
// prototype rather than an old-style declarator with unspecified parameters.
void cmark_init_standard_node_flags(void) {
  static int initialized = 0;
  if (!initialized) {
    // Set the guard before registering so the flags are only assigned once.
    initialized = 1;
    cmark_register_node_flag(&CMARK_NODE__OPEN);
    cmark_register_node_flag(&CMARK_NODE__LAST_LINE_BLANK);
    cmark_register_node_flag(&CMARK_NODE__LAST_LINE_CHECKED);
  }
}
bool cmark_node_can_contain_type(cmark_node *node, cmark_node_type child_type) {
if (child_type == CMARK_NODE_DOCUMENT) {
return false;

View File

@@ -35,6 +35,7 @@ void cmark_reference_create(cmark_map *map, cmark_chunk *label,
ref->attributes = cmark_chunk_literal("");
ref->entry.age = map->size;
ref->entry.next = map->refs;
ref->entry.size = ref->url.len + ref->title.len;
map->refs = (cmark_map_entry *)ref;
map->size++;

File diff suppressed because it is too large Load Diff

View File

@@ -37,7 +37,7 @@ bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c,
tagname = [A-Za-z][A-Za-z0-9-]*;
blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'head'|'header'|'hr'|'html'|'iframe'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'section'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul';
blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'head'|'header'|'hr'|'html'|'iframe'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'section'|'source'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul';
attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
@@ -54,16 +54,15 @@ bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c,
opentag = tagname attribute* spacechar* [/]? [>];
closetag = [/] tagname spacechar* [>];
htmlcomment = "!---->" | ("!--" ([-]? [^\x00>-]) ([-]? [^\x00-])* "-->");
htmlcomment = "--" ([^\x00-]+ | "-" [^\x00-] | "--" [^\x00>])* "-->";
processinginstruction = "?" ([^?>\x00]+ | [?][^>\x00] | [>])* "?>";
processinginstruction = ([^?>\x00]+ | [?][^>\x00] | [>])+;
declaration = "!" [A-Z]+ spacechar+ [^>\x00]* ">";
declaration = [A-Z]+ spacechar+ [^>\x00]*;
cdata = "![CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])* "]]>";
cdata = "CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])*;
htmltag = opentag | closetag | htmlcomment | processinginstruction |
declaration | cdata;
htmltag = opentag | closetag;
in_parens_nosp = [(] (reg_char|escaped_char|[\\])* [)];
@@ -133,6 +132,46 @@ bufsize_t _scan_liberal_html_tag(const unsigned char *p)
*/
}
// Tries to match the `htmlcomment` re2c pattern starting at `p`.
// Returns the distance `p` advanced over the match (p - start), or 0 when the
// pattern does not match.
bufsize_t _scan_html_comment(const unsigned char *p)
{
// presumably the backtracking marker used by the re2c-generated code — confirm
// against the re2c configuration at the top of this file
const unsigned char *marker = NULL;
// remember the starting position so the match length can be computed
const unsigned char *start = p;
/*!re2c
htmlcomment { return (bufsize_t)(p - start); }
* { return 0; }
*/
}
// Tries to match the `processinginstruction` re2c pattern starting at `p`.
// Returns the distance `p` advanced over the match (p - start), or 0 when the
// pattern does not match.
bufsize_t _scan_html_pi(const unsigned char *p)
{
// presumably the backtracking marker used by the re2c-generated code — confirm
// against the re2c configuration at the top of this file
const unsigned char *marker = NULL;
// remember the starting position so the match length can be computed
const unsigned char *start = p;
/*!re2c
processinginstruction { return (bufsize_t)(p - start); }
* { return 0; }
*/
}
// Tries to match the `declaration` re2c pattern starting at `p`.
// Returns the distance `p` advanced over the match (p - start), or 0 when the
// pattern does not match.
bufsize_t _scan_html_declaration(const unsigned char *p)
{
// presumably the backtracking marker used by the re2c-generated code — confirm
// against the re2c configuration at the top of this file
const unsigned char *marker = NULL;
// remember the starting position so the match length can be computed
const unsigned char *start = p;
/*!re2c
declaration { return (bufsize_t)(p - start); }
* { return 0; }
*/
}
// Tries to match the `cdata` re2c pattern starting at `p`.
// Returns the distance `p` advanced over the match (p - start), or 0 when the
// pattern does not match.
bufsize_t _scan_html_cdata(const unsigned char *p)
{
// presumably the backtracking marker used by the re2c-generated code — confirm
// against the re2c configuration at the top of this file
const unsigned char *marker = NULL;
// remember the starting position so the match length can be computed
const unsigned char *start = p;
/*!re2c
cdata { return (bufsize_t)(p - start); }
* { return 0; }
*/
}
// Try to match an HTML block tag start line, returning
// an integer code for the type of block (1-6, matching the spec).
// #7 is handled by a separate function, below.
@@ -140,7 +179,7 @@ bufsize_t _scan_html_block_start(const unsigned char *p)
{
const unsigned char *marker = NULL;
/*!re2c
[<] ('script'|'pre'|'style') (spacechar | [>]) { return 1; }
[<] ('script'|'pre'|'textarea'|'style') (spacechar | [>]) { return 1; }
'<!--' { return 2; }
'<?' { return 3; }
'<!' [A-Z] { return 4; }
@@ -167,7 +206,7 @@ bufsize_t _scan_html_block_end_1(const unsigned char *p)
const unsigned char *marker = NULL;
const unsigned char *start = p;
/*!re2c
[^\n\x00]* [<] [/] ('script'|'pre'|'style') [>] { return (bufsize_t)(p - start); }
[^\n\x00]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return (bufsize_t)(p - start); }
* { return 0; }
*/
}

View File

@@ -13,6 +13,7 @@ def pipe_through_prog(prog, text):
def parse(lib, extlib, text, extensions):
cmark_gfm_core_extensions_ensure_registered = extlib.cmark_gfm_core_extensions_ensure_registered
cmark_init_standard_node_flags = lib.cmark_init_standard_node_flags
find_syntax_extension = lib.cmark_find_syntax_extension
find_syntax_extension.restype = c_void_p
@@ -32,6 +33,7 @@ def parse(lib, extlib, text, extensions):
parser_finish.restype = c_void_p
parser_finish.argtypes = [c_void_p]
cmark_init_standard_node_flags()
cmark_gfm_core_extensions_ensure_registered()
parser = parser_new(0)

View File

@@ -581,6 +581,12 @@ www.github.com www.github.com/á
www.google.com/a_b
Underscores not allowed in host name www.xxx.yyy._zzz
Underscores not allowed in host name www.xxx._yyy.zzz
Underscores allowed in domain name www._xxx.yyy.zzz
**Autolink and http://inlines**
![http://inline.com/image](http://inline.com/image)
@@ -618,6 +624,9 @@ http://🍄.ga/ http://x🍄.ga/
<p>Email me at:<a href="mailto:scyther@pokemon.com">scyther@pokemon.com</a></p>
<p><a href="http://www.github.com">www.github.com</a> <a href="http://www.github.com/%C3%A1">www.github.com/á</a></p>
<p><a href="http://www.google.com/a_b">www.google.com/a_b</a></p>
<p>Underscores not allowed in host name www.xxx.yyy._zzz</p>
<p>Underscores not allowed in host name www.xxx._yyy.zzz</p>
<p>Underscores allowed in domain name <a href="http://www._xxx.yyy.zzz">www._xxx.yyy.zzz</a></p>
<p><strong>Autolink and <a href="http://inlines">http://inlines</a></strong></p>
<p><img src="http://inline.com/image" alt="http://inline.com/image" /></p>
<p><a href="mailto:a.w@b.c">a.w@b.c</a></p>

View File

@@ -63,6 +63,9 @@ pathological = {
"pattern [ (]( repeated":
(("[ (](" * 80000),
re.compile("(\[ \(\]\(){80000}")),
"pattern ![[]() repeated":
("![[]()" * 160000,
re.compile("(!\[<a href=\"\"></a>){160000}")),
"hard link/emph case":
("**x [a*b**c*](d)",
re.compile("\\*\\*x <a href=\"d\">a<em>b\\*\\*c</em></a>")),
@@ -87,6 +90,9 @@ pathological = {
"unclosed links B":
("[a](b" * 30000,
re.compile("(\[a\]\(b){30000}")),
"unclosed <!--":
("</" + "<!--" * 300000,
re.compile("\&lt;\/(\&lt;!--){300000}")),
"tables":
("aaa\rbbb\n-\v\n" * 30000,
re.compile("^<p>aaa</p>\n<table>\n<thead>\n<tr>\n<th>bbb</th>\n</tr>\n</thead>\n<tbody>\n(<tr>\n<td>aaa</td>\n</tr>\n<tr>\n<td>bbb</td>\n</tr>\n<tr>\n<td>-\x0b</td>\n</tr>\n){29999}</tbody>\n</table>\n$")),

View File

@@ -366,3 +366,11 @@ Hello world
.
<p>Hello world</p>
````````````````````````````````
Issue #424 - emphasis before links
```````````````````````````````` example
*text* [link](#section)
.
<p><em>text</em> <a href="#section">link</a></p>
````````````````````````````````

View File

@@ -130,7 +130,7 @@ questions it does not answer:
not require that. This is hardly a "corner case," and divergences
between implementations on this issue often lead to surprises for
users in real documents. (See [this comment by John
Gruber](http://article.gmane.org/gmane.text.markdown.general/1997).)
Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)
2. Is a blank line needed before a block quote or heading?
Most implementations do not require the blank line. However,
@@ -138,7 +138,7 @@ questions it does not answer:
also to ambiguities in parsing (note that some implementations
put the heading inside the blockquote, while others do not).
(John Gruber has also spoken [in favor of requiring the blank
lines](http://article.gmane.org/gmane.text.markdown.general/2146).)
lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)
3. Is a blank line needed before an indented code block?
(`Markdown.pl` requires it, but this is not mentioned in the
@@ -171,7 +171,7 @@ questions it does not answer:
```
(There are some relevant comments by John Gruber
[here](http://article.gmane.org/gmane.text.markdown.general/2554).)
[here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)
5. Can list markers be indented? Can ordered list markers be right-aligned?
@@ -1001,10 +1001,7 @@ interpretable as a [code fence], [ATX heading][ATX headings],
A [setext heading underline](@) is a sequence of
`=` characters or a sequence of `-` characters, with no more than 3
spaces indentation and any number of trailing spaces. If a line
containing a single `-` can be interpreted as an
empty [list items], it should be interpreted this way
and not as a [setext heading underline].
spaces of indentation and any number of trailing spaces or tabs.
The heading is a level 1 heading if `=` characters are used in
the [setext heading underline], and a level 2 heading if `-`
@@ -1638,7 +1635,7 @@ has been found, the code block contains all of the lines after the
opening code fence until the end of the containing block (or
document). (An alternative spec would require backtracking in the
event that a closing code fence is not found. But this makes parsing
much less efficient, and there seems to be no real down side to the
much less efficient, and there seems to be no real downside to the
behavior described here.)
A fenced code block may interrupt a paragraph, and does not require
@@ -2068,7 +2065,7 @@ followed by an uppercase ASCII letter.\
`<![CDATA[`.\
**End condition:** line contains the string `]]>`.
6. **Start condition:** line begins the string `<` or `</`
6. **Start condition:** line begins with the string `<` or `</`
followed by one of the strings (case-insensitive) `address`,
`article`, `aside`, `base`, `basefont`, `blockquote`, `body`,
`caption`, `center`, `col`, `colgroup`, `dd`, `details`, `dialog`,
@@ -5279,7 +5276,7 @@ well. ([reStructuredText](http://docutils.sourceforge.net/rst.html)
takes a different approach, requiring blank lines before lists
even inside other list items.)
In order to solve of unwanted lists in paragraphs with
In order to solve the problem of unwanted lists in paragraphs with
hard-wrapped numerals, we allow only lists starting with `1` to
interrupt paragraphs. Thus,
@@ -9410,10 +9407,9 @@ character, and a `>` character.
A [closing tag](@) consists of the string `</`, a
[tag name], optional [whitespace], and the character `>`.
An [HTML comment](@) consists of `<!--` + *text* + `-->`,
where *text* does not start with `>` or `->`, does not end with `-`,
and does not contain `--`. (See the
[HTML5 spec](http://www.w3.org/TR/html5/syntax.html#comments).)
An [HTML comment](@) consists of `<!-->`, `<!--->`, or `<!--`, a string of
characters not including the string `-->`, and `-->` (see the
[HTML spec](https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state)).
A [processing instruction](@)
consists of the string `<?`, a string
@@ -9554,30 +9550,20 @@ Illegal attributes in closing tag:
Comments:
```````````````````````````````` example
foo <!-- this is a
comment - with hyphen -->
foo <!-- this is a --
comment - with hyphens -->
.
<p>foo <!-- this is a
comment - with hyphen --></p>
<p>foo <!-- this is a --
comment - with hyphens --></p>
````````````````````````````````
```````````````````````````````` example
foo <!-- not a comment -- two hyphens -->
.
<p>foo &lt;!-- not a comment -- two hyphens --&gt;</p>
````````````````````````````````
Not comments:
```````````````````````````````` example
foo <!--> foo -->
foo <!-- foo--->
foo <!---> foo -->
.
<p>foo &lt;!--&gt; foo --&gt;</p>
<p>foo &lt;!-- foo---&gt;</p>
<p>foo <!--> foo --&gt;</p>
<p>foo <!---> foo --&gt;</p>
````````````````````````````````

View File

@@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y \
wget \
clang \
man \
clang-format-3.5 \
clang-format \
&& apt-get clean
RUN wget http://lcamtuf.coredump.cx/afl/releases/afl-latest.tgz && \