mirror of
https://github.com/swiftlang/swift-cmark.git
synced 2026-01-18 17:31:20 +01:00
Merge remote-tracking branch 'github/master' into vmitchell/sync-upstream-gfm.7
rdar://104622655
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -33,6 +33,7 @@ build
|
||||
cmark.dSYM/*
|
||||
cmark
|
||||
.vscode
|
||||
.DS_Store
|
||||
|
||||
# Testing and benchmark
|
||||
alltests.md
|
||||
|
||||
@@ -31,6 +31,16 @@ set(CMAKE_C_STANDARD_REQUIRED YES)
|
||||
# Use CMake's generated headers instead of the Swift package prebuilt ones
|
||||
add_compile_definitions(CMARK_USE_CMAKE_HEADERS)
|
||||
|
||||
# Opt-in switch (default OFF) for building the quadratic-complexity fuzzing harness.
option(CMARK_FUZZ_QUADRATIC "Build quadratic fuzzing harness" OFF)
|
||||
|
||||
# When the option is on, append the fuzzer/AddressSanitizer flags (with debug
# info) to the C and C++ compile flags and to the executable and module linker
# flags, so that the directories added below inherit them.
# NOTE(review): "fuzzer-no-link" presumably instruments the code without
# linking libFuzzer's own main() -- confirm against the clang documentation.
if(CMARK_FUZZ_QUADRATIC)
|
||||
set(FUZZER_FLAGS "-fsanitize=fuzzer-no-link,address -g")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FUZZER_FLAGS}")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FUZZER_FLAGS}")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FUZZER_FLAGS}")
|
||||
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${FUZZER_FLAGS}")
|
||||
endif()
|
||||
|
||||
add_subdirectory(src)
|
||||
add_subdirectory(extensions)
|
||||
if(CMARK_TESTS AND (CMARK_SHARED OR CMARK_STATIC))
|
||||
@@ -41,6 +51,9 @@ if(CMARK_TESTS)
|
||||
enable_testing()
|
||||
add_subdirectory(test testdir)
|
||||
endif()
|
||||
if(CMARK_FUZZ_QUADRATIC)
|
||||
add_subdirectory(fuzz)
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
|
||||
|
||||
6
Makefile
6
Makefile
@@ -22,7 +22,7 @@ VERSION?=$(SPECVERSION)
|
||||
RELEASE?=CommonMark-$(VERSION)
|
||||
INSTALL_PREFIX?=/usr/local
|
||||
CLANG_CHECK?=clang-check
|
||||
CLANG_FORMAT=clang-format-3.5 -style llvm -sort-includes=0 -i
|
||||
CLANG_FORMAT=clang-format -style llvm -sort-includes=0 -i
|
||||
AFL_PATH?=/usr/local/bin
|
||||
|
||||
.PHONY: all cmake_build leakcheck clean fuzztest test debug ubsan asan mingw archive newbench bench format update-spec afl clang-check docker libFuzzer
|
||||
@@ -140,7 +140,7 @@ $(EXTDIR)/ext_scanners.c: $(EXTDIR)/ext_scanners.re
|
||||
esac
|
||||
re2c --case-insensitive -b -i --no-generation-date -8 \
|
||||
--encoding-policy substitute -o $@ $<
|
||||
clang-format-3.5 -style llvm -i $@
|
||||
clang-format -style llvm -i $@
|
||||
|
||||
# We include entities.inc in the repository, so normally this
|
||||
# doesn't need to be regenerated:
|
||||
@@ -211,7 +211,7 @@ format:
|
||||
$(CLANG_FORMAT) src/*.c src/*.h api_test/*.c api_test/*.h
|
||||
|
||||
format-extensions:
|
||||
clang-format-3.5 -style llvm -i extensions/*.c extensions/*.h
|
||||
clang-format -style llvm -i extensions/*.c extensions/*.h
|
||||
|
||||
operf: $(CMARK)
|
||||
operf $< < $(BENCHFILE) > /dev/null
|
||||
|
||||
@@ -1575,6 +1575,7 @@ int main() {
|
||||
int retval;
|
||||
test_batch_runner *runner = test_batch_runner_new();
|
||||
|
||||
cmark_init_standard_node_flags();
|
||||
version(runner);
|
||||
constructor(runner);
|
||||
accessors(runner);
|
||||
|
||||
@@ -143,6 +143,7 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
#endif
|
||||
|
||||
cmark_init_standard_node_flags();
|
||||
cmark_gfm_core_extensions_ensure_registered();
|
||||
|
||||
#ifdef USE_PLEDGE
|
||||
|
||||
@@ -1,3 +1,20 @@
|
||||
[0.29.0.gfm.7]
|
||||
|
||||
* Fixed a polynomial time complexity issue per
|
||||
https://github.com/github/cmark-gfm/security/advisories/GHSA-r572-jvj2-3m8p
|
||||
* Fixed an issue in which a crafted markdown document could trigger an
|
||||
out-of-bounds read in the validate_protocol function per
|
||||
https://github.com/github/cmark-gfm/security/advisories/GHSA-c944-cv5f-hpvr
|
||||
* Fixed a polynomial time complexity issue per
|
||||
https://github.com/github/cmark-gfm/security/advisories/GHSA-24f7-9frr-5h2r
|
||||
* Fixed several polynomial time complexity issues per
|
||||
https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c
|
||||
* We removed an unneeded .DS_Store file (#291)
|
||||
* We added a test for domains with underscores and fixed roundtrip behavior (#292)
|
||||
* We now use an up-to-date clang-format (#294)
|
||||
* We made a variety of implicit integer truncations explicit by moving to
|
||||
size_t as our standard size integer type (#302)
|
||||
|
||||
[0.29.0.gfm.6]
|
||||
* Fixed polynomial time complexity DoS vulnerability in autolink extension
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include <parser.h>
|
||||
#include <string.h>
|
||||
#include <utf8.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define strncasecmp _strnicmp
|
||||
@@ -35,30 +36,63 @@ static int sd_autolink_issafe(const uint8_t *link, size_t link_len) {
|
||||
}
|
||||
|
||||
static size_t autolink_delim(uint8_t *data, size_t link_end) {
|
||||
uint8_t cclose, copen;
|
||||
size_t i;
|
||||
size_t closing = 0;
|
||||
size_t opening = 0;
|
||||
|
||||
for (i = 0; i < link_end; ++i)
|
||||
if (data[i] == '<') {
|
||||
for (i = 0; i < link_end; ++i) {
|
||||
const uint8_t c = data[i];
|
||||
if (c == '<') {
|
||||
link_end = i;
|
||||
break;
|
||||
} else if (c == '(') {
|
||||
opening++;
|
||||
} else if (c == ')') {
|
||||
closing++;
|
||||
}
|
||||
}
|
||||
|
||||
while (link_end > 0) {
|
||||
cclose = data[link_end - 1];
|
||||
|
||||
switch (cclose) {
|
||||
switch (data[link_end - 1]) {
|
||||
case ')':
|
||||
copen = '(';
|
||||
break;
|
||||
default:
|
||||
copen = 0;
|
||||
}
|
||||
|
||||
if (strchr("?!.,:*_~'\"", data[link_end - 1]) != NULL)
|
||||
/* Allow any number of matching brackets (as recognised in copen/cclose)
|
||||
* at the end of the URL. If there is a greater number of closing
|
||||
* brackets than opening ones, we remove one character from the end of
|
||||
* the link.
|
||||
*
|
||||
* Examples (input text => output linked portion):
|
||||
*
|
||||
* http://www.pokemon.com/Pikachu_(Electric)
|
||||
* => http://www.pokemon.com/Pikachu_(Electric)
|
||||
*
|
||||
* http://www.pokemon.com/Pikachu_((Electric)
|
||||
* => http://www.pokemon.com/Pikachu_((Electric)
|
||||
*
|
||||
* http://www.pokemon.com/Pikachu_(Electric))
|
||||
* => http://www.pokemon.com/Pikachu_(Electric)
|
||||
*
|
||||
* http://www.pokemon.com/Pikachu_((Electric))
|
||||
* => http://www.pokemon.com/Pikachu_((Electric))
|
||||
*/
|
||||
if (closing <= opening) {
|
||||
return link_end;
|
||||
}
|
||||
closing--;
|
||||
link_end--;
|
||||
|
||||
else if (data[link_end - 1] == ';') {
|
||||
break;
|
||||
case '?':
|
||||
case '!':
|
||||
case '.':
|
||||
case ',':
|
||||
case ':':
|
||||
case '*':
|
||||
case '_':
|
||||
case '~':
|
||||
case '\'':
|
||||
case '"':
|
||||
link_end--;
|
||||
break;
|
||||
case ';': {
|
||||
size_t new_end = link_end - 2;
|
||||
|
||||
while (new_end > 0 && cmark_isalpha(data[new_end]))
|
||||
@@ -68,46 +102,12 @@ static size_t autolink_delim(uint8_t *data, size_t link_end) {
|
||||
link_end = new_end;
|
||||
else
|
||||
link_end--;
|
||||
} else if (copen != 0) {
|
||||
size_t closing = 0;
|
||||
size_t opening = 0;
|
||||
i = 0;
|
||||
|
||||
/* Allow any number of matching brackets (as recognised in copen/cclose)
|
||||
* at the end of the URL. If there is a greater number of closing
|
||||
* brackets than opening ones, we remove one character from the end of
|
||||
* the link.
|
||||
*
|
||||
* Examples (input text => output linked portion):
|
||||
*
|
||||
* http://www.pokemon.com/Pikachu_(Electric)
|
||||
* => http://www.pokemon.com/Pikachu_(Electric)
|
||||
*
|
||||
* http://www.pokemon.com/Pikachu_((Electric)
|
||||
* => http://www.pokemon.com/Pikachu_((Electric)
|
||||
*
|
||||
* http://www.pokemon.com/Pikachu_(Electric))
|
||||
* => http://www.pokemon.com/Pikachu_(Electric)
|
||||
*
|
||||
* http://www.pokemon.com/Pikachu_((Electric))
|
||||
* => http://www.pokemon.com/Pikachu_((Electric))
|
||||
*/
|
||||
|
||||
while (i < link_end) {
|
||||
if (data[i] == copen)
|
||||
opening++;
|
||||
else if (data[i] == cclose)
|
||||
closing++;
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
if (closing <= opening)
|
||||
break;
|
||||
|
||||
link_end--;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
return link_end;
|
||||
}
|
||||
}
|
||||
|
||||
return link_end;
|
||||
@@ -116,7 +116,20 @@ static size_t autolink_delim(uint8_t *data, size_t link_end) {
|
||||
static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
|
||||
size_t i, np = 0, uscore1 = 0, uscore2 = 0;
|
||||
|
||||
/* The purpose of this code is to reject urls that contain an underscore
|
||||
* in one of the last two segments. Examples:
|
||||
*
|
||||
* www.xxx.yyy.zzz autolinked
|
||||
* www.xxx.yyy._zzz not autolinked
|
||||
* www.xxx._yyy.zzz not autolinked
|
||||
* www._xxx.yyy.zzz autolinked
|
||||
*
|
||||
* The reason is that domain names are allowed to include underscores,
|
||||
* but host names are not. See: https://stackoverflow.com/a/2183140
|
||||
*/
|
||||
for (i = 1; i < size - 1; i++) {
|
||||
if (data[i] == '\\' && i < size - 2)
|
||||
i++;
|
||||
if (data[i] == '_')
|
||||
uscore2++;
|
||||
else if (data[i] == '.') {
|
||||
@@ -127,8 +140,17 @@ static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (uscore1 > 0 || uscore2 > 0)
|
||||
return 0;
|
||||
if (uscore1 > 0 || uscore2 > 0) {
|
||||
/* If the url is very long then accept it despite the underscores,
|
||||
* to avoid quadratic behavior causing a denial of service. See:
|
||||
* https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c
|
||||
* Reasonable urls are unlikely to have more than 10 segments, so
|
||||
* this extra condition shouldn't have any impact on normal usage.
|
||||
*/
|
||||
if (np <= 10) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (allow_short) {
|
||||
/* We don't need a valid domain in the strict sense (with
|
||||
@@ -165,7 +187,7 @@ static cmark_node *www_match(cmark_parser *parser, cmark_node *parent,
|
||||
if (link_end == 0)
|
||||
return NULL;
|
||||
|
||||
while (link_end < size && !cmark_isspace(data[link_end]))
|
||||
while (link_end < size && !cmark_isspace(data[link_end]) && data[link_end] != '<')
|
||||
link_end++;
|
||||
|
||||
link_end = autolink_delim(data, link_end);
|
||||
@@ -225,7 +247,7 @@ static cmark_node *url_match(cmark_parser *parser, cmark_node *parent,
|
||||
return 0;
|
||||
|
||||
link_end += domain_len;
|
||||
while (link_end < size && !cmark_isspace(data[link_end]))
|
||||
while (link_end < size && !cmark_isspace(data[link_end]) && data[link_end] != '<')
|
||||
link_end++;
|
||||
|
||||
link_end = autolink_delim(data, link_end);
|
||||
@@ -269,142 +291,167 @@ static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser,
|
||||
// inline was finished in inlines.c.
|
||||
}
|
||||
|
||||
static bool validate_protocol(char protocol[], uint8_t *data, int rewind) {
|
||||
static bool validate_protocol(char protocol[], uint8_t *data, size_t rewind, size_t max_rewind) {
|
||||
size_t len = strlen(protocol);
|
||||
|
||||
// Check that the protocol matches
|
||||
for (int i = 1; i <= len; i++) {
|
||||
if (data[-rewind - i] != protocol[len - i]) {
|
||||
return false;
|
||||
}
|
||||
if (len > (max_rewind - rewind)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
char prev_char = data[-rewind - len - 1];
|
||||
// Check that the protocol matches
|
||||
if (memcmp(data - rewind - len, protocol, len) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (len == (max_rewind - rewind)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
char prev_char = data[-((ptrdiff_t)rewind) - len - 1];
|
||||
|
||||
// Make sure the character before the protocol is non-alphanumeric
|
||||
return !cmark_isalnum(prev_char);
|
||||
}
|
||||
|
||||
static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset, int depth) {
|
||||
// postprocess_text can recurse very deeply if there is a very long line of
|
||||
// '@' only. Stop at a reasonable depth to ensure it cannot crash.
|
||||
if (depth > 1000) return;
|
||||
static void postprocess_text(cmark_parser *parser, cmark_node *text) {
|
||||
size_t start = 0;
|
||||
size_t offset = 0;
|
||||
// `text` is going to be split into a list of nodes containing shorter segments
|
||||
// of text, so we detach the memory buffer from text and use `cmark_chunk_dup` to
|
||||
// create references to it. Later, `cmark_chunk_to_cstr` is used to convert
|
||||
// the references into allocated buffers. The detached buffer is freed before we
|
||||
// return.
|
||||
cmark_chunk detached_chunk = text->as.literal;
|
||||
text->as.literal = cmark_chunk_dup(&detached_chunk, 0, detached_chunk.len);
|
||||
|
||||
size_t link_end;
|
||||
uint8_t *data = text->as.literal.data,
|
||||
*at;
|
||||
size_t size = text->as.literal.len;
|
||||
bool auto_mailto = true;
|
||||
bool is_xmpp = false;
|
||||
int rewind, max_rewind,
|
||||
nb = 0, np = 0, ns = 0;
|
||||
uint8_t *data = text->as.literal.data;
|
||||
size_t remaining = text->as.literal.len;
|
||||
|
||||
if (offset < 0 || (size_t)offset >= size)
|
||||
return;
|
||||
while (true) {
|
||||
size_t link_end;
|
||||
uint8_t *at;
|
||||
bool auto_mailto = true;
|
||||
bool is_xmpp = false;
|
||||
size_t rewind;
|
||||
size_t max_rewind;
|
||||
size_t np = 0;
|
||||
|
||||
data += offset;
|
||||
size -= offset;
|
||||
if (offset >= remaining)
|
||||
break;
|
||||
|
||||
at = (uint8_t *)memchr(data, '@', size);
|
||||
if (!at)
|
||||
return;
|
||||
at = (uint8_t *)memchr(data + start + offset, '@', remaining - offset);
|
||||
if (!at)
|
||||
break;
|
||||
|
||||
max_rewind = (int)(at - data);
|
||||
data += max_rewind;
|
||||
size -= max_rewind;
|
||||
max_rewind = at - (data + start + offset);
|
||||
|
||||
for (rewind = 0; rewind < max_rewind; ++rewind) {
|
||||
uint8_t c = data[-rewind - 1];
|
||||
found_at:
|
||||
for (rewind = 0; rewind < max_rewind; ++rewind) {
|
||||
uint8_t c = data[start + offset + max_rewind - rewind - 1];
|
||||
|
||||
if (cmark_isalnum(c))
|
||||
continue;
|
||||
|
||||
if (strchr(".+-_", c) != NULL)
|
||||
continue;
|
||||
|
||||
if (strchr(":", c) != NULL) {
|
||||
if (validate_protocol("mailto:", data, rewind)) {
|
||||
auto_mailto = false;
|
||||
if (cmark_isalnum(c))
|
||||
continue;
|
||||
|
||||
if (strchr(".+-_", c) != NULL)
|
||||
continue;
|
||||
|
||||
if (strchr(":", c) != NULL) {
|
||||
if (validate_protocol("mailto:", data + start + offset + max_rewind, rewind, max_rewind)) {
|
||||
auto_mailto = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (validate_protocol("xmpp:", data + start + offset + max_rewind, rewind, max_rewind)) {
|
||||
auto_mailto = false;
|
||||
is_xmpp = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (validate_protocol("xmpp:", data, rewind)) {
|
||||
auto_mailto = false;
|
||||
is_xmpp = true;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (rewind == 0 || ns > 0) {
|
||||
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
|
||||
return;
|
||||
}
|
||||
|
||||
for (link_end = 0; link_end < size; ++link_end) {
|
||||
uint8_t c = data[link_end];
|
||||
|
||||
if (cmark_isalnum(c))
|
||||
if (rewind == 0) {
|
||||
offset += max_rewind + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == '@')
|
||||
nb++;
|
||||
else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1]))
|
||||
np++;
|
||||
else if (c == '/' && is_xmpp)
|
||||
assert(data[start + offset + max_rewind] == '@');
|
||||
for (link_end = 1; link_end < remaining - offset - max_rewind; ++link_end) {
|
||||
uint8_t c = data[start + offset + max_rewind + link_end];
|
||||
|
||||
if (cmark_isalnum(c))
|
||||
continue;
|
||||
|
||||
if (c == '@') {
|
||||
// Found another '@', so go back and try again with an updated offset and max_rewind.
|
||||
offset += max_rewind + 1;
|
||||
max_rewind = link_end - 1;
|
||||
goto found_at;
|
||||
} else if (c == '.' && link_end < remaining - offset - max_rewind - 1 &&
|
||||
cmark_isalnum(data[start + offset + max_rewind + link_end + 1]))
|
||||
np++;
|
||||
else if (c == '/' && is_xmpp)
|
||||
continue;
|
||||
else if (c != '-' && c != '_')
|
||||
break;
|
||||
}
|
||||
|
||||
if (link_end < 2 || np == 0 ||
|
||||
(!cmark_isalpha(data[start + offset + max_rewind + link_end - 1]) &&
|
||||
data[start + offset + max_rewind + link_end - 1] != '.')) {
|
||||
offset += max_rewind + link_end;
|
||||
continue;
|
||||
else if (c != '-' && c != '_')
|
||||
break;
|
||||
}
|
||||
|
||||
if (link_end < 2 || nb != 1 || np == 0 ||
|
||||
(!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) {
|
||||
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
|
||||
return;
|
||||
}
|
||||
|
||||
link_end = autolink_delim(data, link_end);
|
||||
|
||||
if (link_end == 0) {
|
||||
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
|
||||
return;
|
||||
}
|
||||
|
||||
link_end = autolink_delim(data + start + offset + max_rewind, link_end);
|
||||
|
||||
if (link_end == 0) {
|
||||
offset += max_rewind + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
|
||||
cmark_strbuf buf;
|
||||
cmark_strbuf_init(parser->mem, &buf, 10);
|
||||
if (auto_mailto)
|
||||
cmark_strbuf_puts(&buf, "mailto:");
|
||||
cmark_strbuf_put(&buf, data + start + offset + max_rewind - rewind, (bufsize_t)(link_end + rewind));
|
||||
link_node->as.link.url = cmark_chunk_buf_detach(&buf);
|
||||
|
||||
cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
|
||||
cmark_chunk email = cmark_chunk_dup(
|
||||
&detached_chunk,
|
||||
(bufsize_t)(start + offset + max_rewind - rewind),
|
||||
(bufsize_t)(link_end + rewind));
|
||||
cmark_chunk_to_cstr(parser->mem, &email);
|
||||
link_text->as.literal = email;
|
||||
cmark_node_append_child(link_node, link_text);
|
||||
|
||||
cmark_node_insert_after(text, link_node);
|
||||
|
||||
cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
|
||||
post->as.literal = cmark_chunk_dup(&detached_chunk,
|
||||
(bufsize_t)(start + offset + max_rewind + link_end),
|
||||
(bufsize_t)(remaining - offset - max_rewind - link_end));
|
||||
|
||||
cmark_node_insert_after(link_node, post);
|
||||
|
||||
text->as.literal = cmark_chunk_dup(&detached_chunk, (bufsize_t)start, (bufsize_t)(offset + max_rewind - rewind));
|
||||
cmark_chunk_to_cstr(parser->mem, &text->as.literal);
|
||||
|
||||
text = post;
|
||||
start += offset + max_rewind + link_end;
|
||||
remaining -= offset + max_rewind + link_end;
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
// Convert the reference to allocated memory.
|
||||
assert(!text->as.literal.alloc);
|
||||
cmark_chunk_to_cstr(parser->mem, &text->as.literal);
|
||||
|
||||
cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
|
||||
cmark_strbuf buf;
|
||||
cmark_strbuf_init(parser->mem, &buf, 10);
|
||||
if (auto_mailto)
|
||||
cmark_strbuf_puts(&buf, "mailto:");
|
||||
cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind));
|
||||
link_node->as.link.url = cmark_chunk_buf_detach(&buf);
|
||||
|
||||
cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
|
||||
cmark_chunk email = cmark_chunk_dup(
|
||||
&text->as.literal,
|
||||
offset + max_rewind - rewind,
|
||||
(bufsize_t)(link_end + rewind));
|
||||
cmark_chunk_to_cstr(parser->mem, &email);
|
||||
link_text->as.literal = email;
|
||||
cmark_node_append_child(link_node, link_text);
|
||||
|
||||
cmark_node_insert_after(text, link_node);
|
||||
|
||||
cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
|
||||
post->as.literal = cmark_chunk_dup(&text->as.literal,
|
||||
(bufsize_t)(offset + max_rewind + link_end),
|
||||
(bufsize_t)(size - link_end));
|
||||
cmark_chunk_to_cstr(parser->mem, &post->as.literal);
|
||||
|
||||
cmark_node_insert_after(link_node, post);
|
||||
|
||||
text->as.literal.len = offset + max_rewind - rewind;
|
||||
text->as.literal.data[text->as.literal.len] = 0;
|
||||
|
||||
postprocess_text(parser, post, 0, depth + 1);
|
||||
// Free the detached buffer.
|
||||
cmark_chunk_free(parser->mem, &detached_chunk);
|
||||
}
|
||||
|
||||
static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser, cmark_node *root) {
|
||||
@@ -431,7 +478,7 @@ static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser
|
||||
}
|
||||
|
||||
if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_TEXT) {
|
||||
postprocess_text(parser, node, 0, /*depth*/0);
|
||||
postprocess_text(parser, node);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -67,6 +67,7 @@ static delimiter *insert(cmark_syntax_extension *self, cmark_parser *parser,
|
||||
strikethrough->end_column = closer->inl_text->start_column + closer->inl_text->as.literal.len - 1;
|
||||
cmark_node_free(closer->inl_text);
|
||||
|
||||
done:
|
||||
delim = closer;
|
||||
while (delim != NULL && delim != opener) {
|
||||
tmp_delim = delim->previous;
|
||||
@@ -76,7 +77,6 @@ static delimiter *insert(cmark_syntax_extension *self, cmark_parser *parser,
|
||||
|
||||
cmark_inline_parser_remove_delimiter(inline_parser, opener);
|
||||
|
||||
done:
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
@@ -11,24 +11,12 @@
|
||||
#include "table.h"
|
||||
#include "cmark-gfm-core-extensions.h"
|
||||
|
||||
// Custom node flag, initialized in `create_table_extension`.
|
||||
static cmark_node__internal_flags CMARK_NODE__TABLE_VISITED;
|
||||
|
||||
cmark_node_type CMARK_NODE_TABLE, CMARK_NODE_TABLE_ROW,
|
||||
CMARK_NODE_TABLE_CELL;
|
||||
|
||||
typedef struct {
|
||||
uint16_t n_columns;
|
||||
int paragraph_offset;
|
||||
cmark_llist *cells;
|
||||
} table_row;
|
||||
|
||||
typedef struct {
|
||||
uint16_t n_columns;
|
||||
uint8_t *alignments;
|
||||
} node_table;
|
||||
|
||||
typedef struct {
|
||||
bool is_header;
|
||||
} node_table_row;
|
||||
|
||||
typedef struct {
|
||||
unsigned colspan, rowspan;
|
||||
} node_cell_data;
|
||||
@@ -39,21 +27,41 @@ typedef struct {
|
||||
node_cell_data *cell_data;
|
||||
} node_cell;
|
||||
|
||||
static void free_table_cell(cmark_mem *mem, void *data) {
|
||||
node_cell *cell = (node_cell *)data;
|
||||
typedef struct {
|
||||
uint16_t n_columns;
|
||||
int paragraph_offset;
|
||||
node_cell *cells;
|
||||
} table_row;
|
||||
|
||||
typedef struct {
|
||||
uint16_t n_columns;
|
||||
uint8_t *alignments;
|
||||
} node_table;
|
||||
|
||||
typedef struct {
|
||||
bool is_header;
|
||||
} node_table_row;
|
||||
|
||||
static void free_table_cell(cmark_mem *mem, node_cell *cell) {
|
||||
cmark_strbuf_free((cmark_strbuf *)cell->buf);
|
||||
mem->free(cell->buf);
|
||||
if (cell->cell_data)
|
||||
mem->free(cell->cell_data);
|
||||
mem->free(cell);
|
||||
}
|
||||
|
||||
// Releases all cells owned by `row` and the array that holds them.
// Cells are freed from the last to the first, decrementing row->n_columns
// as it goes, so n_columns is 0 and row->cells is NULL on return. The row
// struct itself is not freed here.
static void free_row_cells(cmark_mem *mem, table_row *row) {
|
||||
while (row->n_columns > 0) {
|
||||
free_table_cell(mem, &row->cells[--row->n_columns]);
|
||||
}
|
||||
mem->free(row->cells);
|
||||
row->cells = NULL;
|
||||
}
|
||||
|
||||
static void free_table_row(cmark_mem *mem, table_row *row) {
|
||||
if (!row)
|
||||
return;
|
||||
|
||||
cmark_llist_free_full(mem, row->cells, (cmark_free_func)free_table_cell);
|
||||
|
||||
free_row_cells(mem, row);
|
||||
mem->free(row);
|
||||
}
|
||||
|
||||
@@ -175,6 +183,24 @@ static cmark_strbuf *unescape_pipes(cmark_mem *mem, unsigned char *string, bufsi
|
||||
return res;
|
||||
}
|
||||
|
||||
// Adds a new cell to the end of the row, growing row->cells when needed.
|
||||
// Returns a pointer to the new cell for the caller to initialize, or NULL
// (leaving the row unchanged) if the column count would exceed UINT16_MAX.
static node_cell* append_row_cell(cmark_mem *mem, table_row *row) {
|
||||
const uint32_t n_columns = row->n_columns + 1;
|
||||
// Reallocate only when the new count is a power of 2 (1, 2, 4, ...).
|
||||
if ((n_columns & (n_columns-1)) == 0) {
|
||||
// Never let row->n_columns wrap: 65536 is a power of 2, so the overflow
|
||||
// is caught here. The caller must treat NULL as an overflow and clean up.
|
||||
if (n_columns > UINT16_MAX) {
|
||||
return NULL;
|
||||
}
|
||||
// Grow to 2*n_columns - 1 cells: room for every cell up to the next power of 2.
|
||||
row->cells = (node_cell *)mem->realloc(row->cells, (2 * n_columns - 1) * sizeof(node_cell));
|
||||
}
|
||||
row->n_columns = (uint16_t)n_columns;
|
||||
return &row->cells[n_columns-1];
|
||||
}
|
||||
|
||||
static table_row *row_from_string(cmark_syntax_extension *self,
|
||||
cmark_parser *parser, unsigned char *string,
|
||||
int len) {
|
||||
@@ -216,15 +242,22 @@ static table_row *row_from_string(cmark_syntax_extension *self,
|
||||
cell_matched);
|
||||
cmark_strbuf_trim(cell_buf);
|
||||
|
||||
node_cell *cell = (node_cell *)parser->mem->calloc(1, sizeof(*cell));
|
||||
node_cell *cell = append_row_cell(parser->mem, row);
|
||||
if (!cell) {
|
||||
int_overflow_abort = 1;
|
||||
cmark_strbuf_free(cell_buf);
|
||||
parser->mem->free(cell_buf);
|
||||
break;
|
||||
}
|
||||
cell->buf = cell_buf;
|
||||
cell->start_offset = offset;
|
||||
if (cell_matched > 0)
|
||||
cell->end_offset = offset + cell_matched - 1;
|
||||
else
|
||||
cell->end_offset = offset;
|
||||
cell->internal_offset = 0;
|
||||
|
||||
while (cell->start_offset > 0 && string[cell->start_offset - 1] != '|') {
|
||||
while (cell->start_offset > row->paragraph_offset && string[cell->start_offset - 1] != '|') {
|
||||
--cell->start_offset;
|
||||
++cell->internal_offset;
|
||||
}
|
||||
@@ -237,13 +270,11 @@ static table_row *row_from_string(cmark_syntax_extension *self,
|
||||
cell->cell_data->colspan = 0;
|
||||
|
||||
// find the last cell that isn't part of a colspan, and increment that colspan
|
||||
cmark_llist *tmp = row->cells;
|
||||
node_cell *colspan_cell = NULL;
|
||||
while (tmp) {
|
||||
node_cell *this_cell = (node_cell *)tmp->data;
|
||||
for (uint16_t i = 0; i < row->n_columns; i++) {
|
||||
node_cell *this_cell = &row->cells[i];
|
||||
if (this_cell->cell_data->colspan > 0)
|
||||
colspan_cell = this_cell;
|
||||
tmp = tmp->next;
|
||||
}
|
||||
if (colspan_cell)
|
||||
++colspan_cell->cell_data->colspan;
|
||||
@@ -272,8 +303,6 @@ static table_row *row_from_string(cmark_syntax_extension *self,
|
||||
int_overflow_abort = 1;
|
||||
break;
|
||||
}
|
||||
row->n_columns += 1;
|
||||
row->cells = cmark_llist_append(parser->mem, row->cells, cell);
|
||||
}
|
||||
|
||||
offset += cell_matched + pipe_matched;
|
||||
@@ -291,9 +320,7 @@ static table_row *row_from_string(cmark_syntax_extension *self,
|
||||
if (row_end_offset && offset != len) {
|
||||
row->paragraph_offset = offset;
|
||||
|
||||
cmark_llist_free_full(parser->mem, row->cells, (cmark_free_func)free_table_cell);
|
||||
row->cells = NULL;
|
||||
row->n_columns = 0;
|
||||
free_row_cells(parser->mem, row);
|
||||
|
||||
// Scan past the (optional) leading pipe.
|
||||
offset += scan_table_cell_end(string, len, offset);
|
||||
@@ -344,6 +371,10 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
|
||||
const char *parent_string;
|
||||
uint16_t i;
|
||||
|
||||
if (parent_container->flags & CMARK_NODE__TABLE_VISITED) {
|
||||
return parent_container;
|
||||
}
|
||||
|
||||
if (!scan_table_start(input, len, cmark_parser_get_first_nonspace(parser))) {
|
||||
return parent_container;
|
||||
}
|
||||
@@ -371,6 +402,7 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
|
||||
free_table_row(parser->mem, marker_row);
|
||||
free_table_row(parser->mem, header_row);
|
||||
cmark_arena_pop();
|
||||
parent_container->flags |= CMARK_NODE__TABLE_VISITED;
|
||||
return parent_container;
|
||||
}
|
||||
|
||||
@@ -407,9 +439,8 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
|
||||
// since we populate the alignments array based on marker_row->cells
|
||||
uint8_t *alignments =
|
||||
(uint8_t *)parser->mem->calloc(marker_row->n_columns, sizeof(uint8_t));
|
||||
cmark_llist *it = marker_row->cells;
|
||||
for (i = 0; it; it = it->next, ++i) {
|
||||
node_cell *node = (node_cell *)it->data;
|
||||
for (i = 0; i < marker_row->n_columns; ++i) {
|
||||
node_cell *node = &marker_row->cells[i];
|
||||
bool left = node->buf->ptr[0] == ':', right = node->buf->ptr[node->buf->size - 1] == ':';
|
||||
|
||||
if (left && right)
|
||||
@@ -432,10 +463,8 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
|
||||
ntr->is_header = true;
|
||||
|
||||
{
|
||||
cmark_llist *tmp;
|
||||
|
||||
for (tmp = header_row->cells; tmp; tmp = tmp->next) {
|
||||
node_cell *cell = (node_cell *) tmp->data;
|
||||
for (i = 0; i < header_row->n_columns; ++i) {
|
||||
node_cell *cell = &header_row->cells[i];
|
||||
cmark_node *header_cell = cmark_parser_add_child(parser, table_header,
|
||||
CMARK_NODE_TABLE_CELL, parent_container->start_column + cell->start_offset);
|
||||
header_cell->start_line = header_cell->end_line = parent_container->start_line;
|
||||
@@ -487,11 +516,10 @@ static cmark_node *try_opening_table_row(cmark_syntax_extension *self,
|
||||
|
||||
if (parser->options & CMARK_OPT_TABLE_SPANS) {
|
||||
// Check the new row for rowspan markers and increment the rowspan of the cell it's merging with
|
||||
cmark_llist *tmp;
|
||||
int i;
|
||||
|
||||
for (tmp = row->cells, i = 0; tmp && i < table_columns; tmp = tmp->next, ++i) {
|
||||
node_cell *this_cell = (node_cell *)tmp->data;
|
||||
for (i = 0; i < row->n_columns && i < table_columns; ++i) {
|
||||
node_cell *this_cell = &row->cells[i];
|
||||
if (this_cell->cell_data->rowspan == 0) {
|
||||
// Rowspan marker. Scan up through previous rows and increment the spanning cell's rowspan
|
||||
cmark_node *check_row = table_row_block->prev;
|
||||
@@ -515,11 +543,10 @@ static cmark_node *try_opening_table_row(cmark_syntax_extension *self,
|
||||
}
|
||||
|
||||
{
|
||||
cmark_llist *tmp;
|
||||
int i;
|
||||
|
||||
for (tmp = row->cells, i = 0; tmp && i < table_columns; tmp = tmp->next, ++i) {
|
||||
node_cell *cell = (node_cell *) tmp->data;
|
||||
for (i = 0; i < row->n_columns && i < table_columns; ++i) {
|
||||
node_cell *cell = &row->cells[i];
|
||||
cmark_node *node = cmark_parser_add_child(parser, table_row_block,
|
||||
CMARK_NODE_TABLE_CELL, parent_container->start_column + cell->start_offset);
|
||||
node->internal_offset = cell->internal_offset;
|
||||
@@ -980,6 +1007,7 @@ static int escape(cmark_syntax_extension *self, cmark_node *node, int c) {
|
||||
cmark_syntax_extension *create_table_extension(void) {
|
||||
cmark_syntax_extension *self = cmark_syntax_extension_new("table");
|
||||
|
||||
cmark_register_node_flag(&CMARK_NODE__TABLE_VISITED);
|
||||
cmark_syntax_extension_set_match_block_func(self, matches);
|
||||
cmark_syntax_extension_set_open_block_func(self, try_opening_table_block);
|
||||
cmark_syntax_extension_set_get_type_string_func(self, get_type_string);
|
||||
|
||||
21
fuzz/CMakeLists.txt
Normal file
21
fuzz/CMakeLists.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
include_directories(
|
||||
${PROJECT_BINARY_DIR}/extensions
|
||||
${PROJECT_BINARY_DIR}/src
|
||||
../extensions
|
||||
../src
|
||||
)
|
||||
|
||||
# fuzzer(<name>): defines a fuzz target executable <name> built from <name>.c.
# The target is compiled and linked with -fsanitize=fuzzer and linked against
# the shared cmark-gfm libraries when CMARK_SHARED is set, otherwise against
# the static ones when CMARK_STATIC is set. If neither is set, no cmark
# library is linked by this macro.
macro(fuzzer name)
|
||||
add_executable(${name} ${name}.c)
|
||||
set_target_properties(${name}
|
||||
PROPERTIES
|
||||
COMPILE_FLAGS "-fsanitize=fuzzer"
|
||||
LINK_FLAGS "-fsanitize=fuzzer")
|
||||
if(CMARK_SHARED)
|
||||
target_link_libraries(${name} libcmark-gfm-extensions libcmark-gfm)
|
||||
elseif(CMARK_STATIC)
|
||||
target_link_libraries(${name} libcmark-gfm-extensions_static libcmark-gfm_static)
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
fuzzer(fuzz_quadratic)
|
||||
12
fuzz/README.md
Normal file
12
fuzz/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
The quadratic fuzzer generates long sequences of repeated characters, such as `<?x<?x<?x<?x...`,
|
||||
to detect quadratic complexity performance issues.
|
||||
|
||||
To build and run the quadratic fuzzer:
|
||||
|
||||
```bash
|
||||
mkdir build-fuzz
|
||||
cd build-fuzz
|
||||
cmake -DCMARK_FUZZ_QUADRATIC=ON -DCMAKE_C_COMPILER=$(which clang) -DCMAKE_CXX_COMPILER=$(which clang++) -DCMAKE_BUILD_TYPE=Release ..
|
||||
make
|
||||
../fuzz/fuzzloop.sh
|
||||
```
|
||||
87
fuzz/fuzz_quadratic.c
Normal file
87
fuzz/fuzz_quadratic.c
Normal file
@@ -0,0 +1,87 @@
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "cmark-gfm.h"
|
||||
#include "cmark-gfm-core-extensions.h"
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/* NULL-terminated list of the syntax extensions that the fuzz harness
 * attaches to every parser it creates. */
const char *extension_names[] = {"autolink", "strikethrough", "table",
                                 "tagfilter", NULL};
|
||||
|
||||
/* libFuzzer start-up hook. Registers the standard node flags and the GFM
 * core extensions so that the per-input callback can look extensions up by
 * name. The command-line arguments are not inspected. Always returns 0. */
int LLVMFuzzerInitialize(int *argc, char ***argv) {
  (void)argc;
  (void)argv;

  cmark_init_standard_node_flags();
  cmark_gfm_core_extensions_ensure_registered();

  return 0;
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
|
||||
struct __attribute__((packed)) {
|
||||
int options;
|
||||
int width;
|
||||
uint8_t splitpoint;
|
||||
uint8_t repeatlen;
|
||||
} fuzz_config;
|
||||
|
||||
if (size >= sizeof(fuzz_config)) {
|
||||
/* The beginning of `data` is treated as fuzzer configuration */
|
||||
memcpy(&fuzz_config, data, sizeof(fuzz_config));
|
||||
|
||||
/* Test options that are used by GitHub. */
|
||||
fuzz_config.options = CMARK_OPT_UNSAFE | CMARK_OPT_FOOTNOTES | CMARK_OPT_GITHUB_PRE_LANG | CMARK_OPT_HARDBREAKS;
|
||||
|
||||
/* Remainder of input is the markdown */
|
||||
const char *markdown0 = (const char *)(data + sizeof(fuzz_config));
|
||||
const size_t markdown_size0 = size - sizeof(fuzz_config);
|
||||
char markdown[0x80000];
|
||||
if (markdown_size0 <= sizeof(markdown)) {
|
||||
size_t markdown_size = 0;
|
||||
if (fuzz_config.splitpoint <= markdown_size0 && 0 < fuzz_config.repeatlen &&
|
||||
fuzz_config.repeatlen <= markdown_size0 - fuzz_config.splitpoint) {
|
||||
const size_t size_after_splitpoint = markdown_size0 - fuzz_config.splitpoint - fuzz_config.repeatlen;
|
||||
memcpy(&markdown[markdown_size], &markdown0[0], fuzz_config.splitpoint);
|
||||
markdown_size += fuzz_config.splitpoint;
|
||||
|
||||
while (markdown_size + fuzz_config.repeatlen + size_after_splitpoint <= sizeof(markdown)) {
|
||||
memcpy(&markdown[markdown_size], &markdown0[fuzz_config.splitpoint],
|
||||
fuzz_config.repeatlen);
|
||||
markdown_size += fuzz_config.repeatlen;
|
||||
}
|
||||
memcpy(&markdown[markdown_size], &markdown0[fuzz_config.splitpoint + fuzz_config.repeatlen],
|
||||
size_after_splitpoint);
|
||||
markdown_size += size_after_splitpoint;
|
||||
} else {
|
||||
markdown_size = markdown_size0;
|
||||
memcpy(markdown, markdown0, markdown_size);
|
||||
}
|
||||
|
||||
cmark_parser *parser = cmark_parser_new(fuzz_config.options);
|
||||
|
||||
for (const char **it = extension_names; *it; ++it) {
|
||||
const char *extension_name = *it;
|
||||
cmark_syntax_extension *syntax_extension = cmark_find_syntax_extension(extension_name);
|
||||
if (!syntax_extension) {
|
||||
fprintf(stderr, "%s is not a valid syntax extension\n", extension_name);
|
||||
abort();
|
||||
}
|
||||
cmark_parser_attach_syntax_extension(parser, syntax_extension);
|
||||
}
|
||||
|
||||
cmark_parser_feed(parser, markdown, markdown_size);
|
||||
cmark_node *doc = cmark_parser_finish(parser);
|
||||
|
||||
free(cmark_render_html(doc, fuzz_config.options, NULL));
|
||||
|
||||
cmark_node_free(doc);
|
||||
cmark_parser_free(parser);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
28
fuzz/fuzzloop.sh
Executable file
28
fuzz/fuzzloop.sh
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/bin/bash
#
# Runs the quadratic fuzzer in an endless minimize-then-fuzz loop.
# The paths below are relative, so start this script from the build
# directory (the one that contains ./fuzz/fuzz_quadratic).

# Stop when an error is found
set -e

# One fuzzing job/worker per online CPU. `nproc` is not available on every
# platform (e.g. stock macOS), so fall back to getconf and finally to a
# single job instead of passing an empty -jobs=/-workers= value.
ncpu=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)

# Create a corpus sub-directory if it doesn't already exist.
mkdir -p corpus

# Memory and disk usage grow over time, so this loop restarts the
# fuzzer every 4 hours. The `-merge=1` option is used to minimize the
# corpus on each iteration.
while :
do
  date
  echo restarting loop

  # Minimize the corpus
  mv corpus/ corpus2
  mkdir corpus
  echo minimizing corpus
  ./fuzz/fuzz_quadratic -merge=1 corpus ../bench corpus2/ -max_len=1024
  rm -r corpus2

  # Run the fuzzer for 4 hours
  date
  echo start fuzzer
  ./fuzz/fuzz_quadratic corpus -dict=../test/fuzzing_dictionary -jobs="$ncpu" -workers="$ncpu" -max_len=1024 -max_total_time=14400
done
|
||||
18
src/arena.c
18
src/arena.c
@@ -84,19 +84,17 @@ static void *arena_calloc(size_t nmem, size_t size) {
|
||||
|
||||
CMARK_INITIALIZE_AND_LOCK(arena);
|
||||
|
||||
void *ptr = NULL;
|
||||
|
||||
struct arena_chunk *chunk;
|
||||
if (sz > A->sz) {
|
||||
A->prev = alloc_arena_chunk(sz, A->prev);
|
||||
ptr = (uint8_t *) A->prev->ptr;
|
||||
A->prev = chunk = alloc_arena_chunk(sz, A->prev);
|
||||
} else if (sz > A->sz - A->used) {
|
||||
A = chunk = alloc_arena_chunk(A->sz + A->sz / 2, A);
|
||||
} else {
|
||||
if (sz > A->sz - A->used) {
|
||||
A = alloc_arena_chunk(A->sz + A->sz / 2, A);
|
||||
}
|
||||
ptr = (uint8_t *) A->ptr + A->used;
|
||||
A->used += sz;
|
||||
*((size_t *) ptr) = sz - sizeof(size_t);
|
||||
chunk = A;
|
||||
}
|
||||
void *ptr = (uint8_t *) chunk->ptr + chunk->used;
|
||||
chunk->used += sz;
|
||||
*((size_t *) ptr) = sz - sizeof(size_t);
|
||||
|
||||
CMARK_UNLOCK(arena);
|
||||
|
||||
|
||||
14
src/blocks.c
14
src/blocks.c
@@ -8,6 +8,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "cmark_ctype.h"
|
||||
#include "syntax_extension.h"
|
||||
@@ -665,6 +666,14 @@ static cmark_node *finalize_document(cmark_parser *parser) {
|
||||
}
|
||||
|
||||
finalize(parser, parser->root);
|
||||
|
||||
// Limit total size of extra content created from reference links to
|
||||
// document size to avoid superlinear growth. Always allow 100KB.
|
||||
if (parser->total_size > 100000)
|
||||
parser->refmap->max_ref_size = parser->total_size;
|
||||
else
|
||||
parser->refmap->max_ref_size = 100000;
|
||||
|
||||
process_inlines(parser, parser->refmap, parser->options);
|
||||
if (parser->options & CMARK_OPT_FOOTNOTES)
|
||||
process_footnotes(parser);
|
||||
@@ -725,6 +734,11 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
|
||||
static const uint8_t repl[] = {239, 191, 189};
|
||||
bool preserveWhitespace = parser->options & CMARK_OPT_PRESERVE_WHITESPACE;
|
||||
|
||||
if (len > UINT_MAX - parser->total_size)
|
||||
parser->total_size = UINT_MAX;
|
||||
else
|
||||
parser->total_size += len;
|
||||
|
||||
if (parser->last_buffer_ended_with_cr && *buffer == '\n') {
|
||||
// skip NL if last buffer ended with CR ; see #117
|
||||
buffer++;
|
||||
|
||||
@@ -114,6 +114,7 @@ typedef struct delimiter {
|
||||
struct delimiter *previous;
|
||||
struct delimiter *next;
|
||||
cmark_node *inl_text;
|
||||
bufsize_t position;
|
||||
bufsize_t length;
|
||||
unsigned char delim_char;
|
||||
int can_open;
|
||||
|
||||
@@ -10,7 +10,8 @@ extern "C" {
|
||||
struct cmark_map_entry {
|
||||
struct cmark_map_entry *next;
|
||||
unsigned char *label;
|
||||
unsigned int age;
|
||||
size_t age;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
typedef struct cmark_map_entry cmark_map_entry;
|
||||
@@ -23,7 +24,9 @@ struct cmark_map {
|
||||
cmark_mem *mem;
|
||||
cmark_map_entry *refs;
|
||||
cmark_map_entry **sorted;
|
||||
unsigned int size;
|
||||
size_t size;
|
||||
size_t ref_size;
|
||||
size_t max_ref_size;
|
||||
cmark_map_free_f free;
|
||||
};
|
||||
|
||||
|
||||
@@ -52,11 +52,7 @@ typedef struct {
|
||||
cmark_chunk on_exit;
|
||||
} cmark_custom;
|
||||
|
||||
enum cmark_node__internal_flags {
|
||||
CMARK_NODE__OPEN = (1 << 0),
|
||||
CMARK_NODE__LAST_LINE_BLANK = (1 << 1),
|
||||
CMARK_NODE__LAST_LINE_CHECKED = (1 << 2),
|
||||
};
|
||||
typedef uint16_t cmark_node__internal_flags;
|
||||
|
||||
struct cmark_node {
|
||||
cmark_strbuf content;
|
||||
@@ -76,7 +72,7 @@ struct cmark_node {
|
||||
int end_column;
|
||||
int internal_offset;
|
||||
uint16_t type;
|
||||
uint16_t flags;
|
||||
cmark_node__internal_flags flags;
|
||||
int backtick_count;
|
||||
|
||||
cmark_syntax_extension *extension;
|
||||
@@ -101,6 +97,30 @@ struct cmark_node {
|
||||
} as;
|
||||
};
|
||||
|
||||
/**
|
||||
* Syntax extensions can use this function to register a custom node
|
||||
* flag. The flags are stored in the `flags` field of the `cmark_node`
|
||||
* struct. The `flags` parameter should be the address of a global variable
|
||||
* which will store the flag value.
|
||||
*/
|
||||
CMARK_GFM_EXPORT
|
||||
void cmark_register_node_flag(cmark_node__internal_flags *flags);
|
||||
|
||||
/**
|
||||
* Standard node flags. (Initialized using `cmark_init_standard_node_flags`.)
|
||||
*/
|
||||
extern cmark_node__internal_flags CMARK_NODE__OPEN;
|
||||
extern cmark_node__internal_flags CMARK_NODE__LAST_LINE_BLANK;
|
||||
extern cmark_node__internal_flags CMARK_NODE__LAST_LINE_CHECKED;
|
||||
|
||||
/**
|
||||
* Uses `cmark_register_node_flag` to initialize the standard node flags.
|
||||
* This function should be called at program startup time. Calling it
|
||||
* multiple times has no additional effect.
|
||||
*/
|
||||
CMARK_GFM_EXPORT
|
||||
void cmark_init_standard_node_flags();
|
||||
|
||||
static CMARK_INLINE cmark_mem *cmark_node_mem(cmark_node *node) {
|
||||
return node->content.mem;
|
||||
}
|
||||
|
||||
@@ -47,6 +47,7 @@ struct cmark_parser {
|
||||
/* Options set by the user, see the Options section in cmark.h */
|
||||
int options;
|
||||
bool last_buffer_ended_with_cr;
|
||||
size_t total_size;
|
||||
cmark_llist *syntax_extensions;
|
||||
cmark_llist *inline_syntax_extensions;
|
||||
cmark_ispunct_func backslash_ispunct;
|
||||
|
||||
@@ -15,6 +15,10 @@ bufsize_t _scan_autolink_uri(const unsigned char *p);
|
||||
bufsize_t _scan_autolink_email(const unsigned char *p);
|
||||
bufsize_t _scan_html_tag(const unsigned char *p);
|
||||
bufsize_t _scan_liberal_html_tag(const unsigned char *p);
|
||||
bufsize_t _scan_html_comment(const unsigned char *p);
|
||||
bufsize_t _scan_html_pi(const unsigned char *p);
|
||||
bufsize_t _scan_html_declaration(const unsigned char *p);
|
||||
bufsize_t _scan_html_cdata(const unsigned char *p);
|
||||
bufsize_t _scan_html_block_start(const unsigned char *p);
|
||||
bufsize_t _scan_html_block_start_7(const unsigned char *p);
|
||||
bufsize_t _scan_html_block_end_1(const unsigned char *p);
|
||||
@@ -37,6 +41,10 @@ bufsize_t _scan_footnote_definition(const unsigned char *p);
|
||||
#define scan_autolink_email(c, n) _scan_at(&_scan_autolink_email, c, n)
|
||||
#define scan_html_tag(c, n) _scan_at(&_scan_html_tag, c, n)
|
||||
#define scan_liberal_html_tag(c, n) _scan_at(&_scan_liberal_html_tag, c, n)
|
||||
#define scan_html_comment(c, n) _scan_at(&_scan_html_comment, c, n)
|
||||
#define scan_html_pi(c, n) _scan_at(&_scan_html_pi, c, n)
|
||||
#define scan_html_declaration(c, n) _scan_at(&_scan_html_declaration, c, n)
|
||||
#define scan_html_cdata(c, n) _scan_at(&_scan_html_cdata, c, n)
|
||||
#define scan_html_block_start(c, n) _scan_at(&_scan_html_block_start, c, n)
|
||||
#define scan_html_block_start_7(c, n) _scan_at(&_scan_html_block_start_7, c, n)
|
||||
#define scan_html_block_end_1(c, n) _scan_at(&_scan_html_block_end_1, c, n)
|
||||
|
||||
192
src/inlines.c
192
src/inlines.c
@@ -41,7 +41,6 @@ typedef enum {
|
||||
|
||||
typedef struct bracket {
|
||||
struct bracket *previous;
|
||||
struct delimiter *previous_delimiter;
|
||||
cmark_node *inl_text;
|
||||
bufsize_t position;
|
||||
bracket_type type;
|
||||
@@ -50,9 +49,15 @@ typedef struct bracket {
|
||||
bool in_bracket[4];
|
||||
} bracket;
|
||||
|
||||
#define FLAG_SKIP_HTML_CDATA (1u << 0)
|
||||
#define FLAG_SKIP_HTML_DECLARATION (1u << 1)
|
||||
#define FLAG_SKIP_HTML_PI (1u << 2)
|
||||
#define FLAG_SKIP_HTML_COMMENT (1u << 3)
|
||||
|
||||
typedef struct subject{
|
||||
cmark_mem *mem;
|
||||
cmark_chunk input;
|
||||
unsigned flags;
|
||||
int line;
|
||||
bufsize_t pos;
|
||||
int block_offset;
|
||||
@@ -62,6 +67,7 @@ typedef struct subject{
|
||||
bracket *last_bracket;
|
||||
bufsize_t backticks[MAXBACKTICKS + 1];
|
||||
bool scanned_for_backticks;
|
||||
bool no_link_openers;
|
||||
} subject;
|
||||
|
||||
void cmark_set_default_skip_chars(int8_t **skip_chars, bool use_memcpy) {
|
||||
@@ -122,6 +128,24 @@ static cmark_node *make_str_with_entities(subject *subj,
|
||||
}
|
||||
}
|
||||
|
||||
// Like cmark_node_append_child but without costly sanity checks.
|
||||
// Assumes that child was newly created.
|
||||
static void append_child(cmark_node *node, cmark_node *child) {
|
||||
cmark_node *old_last_child = node->last_child;
|
||||
|
||||
child->next = NULL;
|
||||
child->prev = old_last_child;
|
||||
child->parent = node;
|
||||
node->last_child = child;
|
||||
|
||||
if (old_last_child) {
|
||||
old_last_child->next = child;
|
||||
} else {
|
||||
// Also set first_child if node previously had no children.
|
||||
node->first_child = child;
|
||||
}
|
||||
}
|
||||
|
||||
// Duplicate a chunk by creating a copy of the buffer not by reusing the
|
||||
// buffer like cmark_chunk_dup does.
|
||||
static cmark_chunk chunk_clone(cmark_mem *mem, cmark_chunk *src) {
|
||||
@@ -165,7 +189,7 @@ static CMARK_INLINE cmark_node *make_autolink(subject *subj,
|
||||
link->start_line = link->end_line = subj->line;
|
||||
link->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
|
||||
link->end_column = end_column + 1 + subj->column_offset + subj->block_offset;
|
||||
cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
|
||||
append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
|
||||
return link;
|
||||
}
|
||||
|
||||
@@ -174,6 +198,7 @@ static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset,
|
||||
int i;
|
||||
e->mem = mem;
|
||||
e->input = *chunk;
|
||||
e->flags = 0;
|
||||
e->line = line_number;
|
||||
e->pos = 0;
|
||||
e->block_offset = block_offset;
|
||||
@@ -185,6 +210,7 @@ static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset,
|
||||
e->backticks[i] = 0;
|
||||
}
|
||||
e->scanned_for_backticks = false;
|
||||
e->no_link_openers = true;
|
||||
}
|
||||
|
||||
static CMARK_INLINE int isbacktick(int c) { return (c == '`'); }
|
||||
@@ -520,6 +546,7 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open,
|
||||
delim->can_open = can_open;
|
||||
delim->can_close = can_close;
|
||||
delim->inl_text = inl_text;
|
||||
delim->position = subj->pos;
|
||||
delim->length = inl_text->as.literal.len;
|
||||
delim->previous = subj->last_delim;
|
||||
delim->next = NULL;
|
||||
@@ -539,11 +566,13 @@ static void push_bracket(subject *subj, bracket_type type, cmark_node *inl_text)
|
||||
b->active = true;
|
||||
b->inl_text = inl_text;
|
||||
b->previous = subj->last_bracket;
|
||||
b->previous_delimiter = subj->last_delim;
|
||||
b->position = subj->pos;
|
||||
b->bracket_after = false;
|
||||
b->in_bracket[type] = true;
|
||||
subj->last_bracket = b;
|
||||
if (type != IMAGE) {
|
||||
subj->no_link_openers = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Assumes the subject has a c at the current position.
|
||||
@@ -650,12 +679,13 @@ static cmark_syntax_extension *get_extension_for_special_char(cmark_parser *pars
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *stack_bottom) {
|
||||
delimiter *closer = subj->last_delim;
|
||||
static void process_emphasis(cmark_parser *parser, subject *subj, bufsize_t stack_bottom) {
|
||||
delimiter *candidate;
|
||||
delimiter *closer = NULL;
|
||||
delimiter *opener;
|
||||
delimiter *old_closer;
|
||||
bool opener_found;
|
||||
delimiter *openers_bottom[3][128];
|
||||
bufsize_t openers_bottom[3][128];
|
||||
int i;
|
||||
|
||||
// initialize openers_bottom:
|
||||
@@ -668,8 +698,10 @@ static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *sta
|
||||
}
|
||||
|
||||
// move back to first relevant delim.
|
||||
while (closer != NULL && closer->previous != stack_bottom) {
|
||||
closer = closer->previous;
|
||||
candidate = subj->last_delim;
|
||||
while (candidate != NULL && candidate->position >= stack_bottom) {
|
||||
closer = candidate;
|
||||
candidate = candidate->previous;
|
||||
}
|
||||
|
||||
// now move forward, looking for closers, and handling each
|
||||
@@ -679,8 +711,8 @@ static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *sta
|
||||
// Now look backwards for first matching opener:
|
||||
opener = closer->previous;
|
||||
opener_found = false;
|
||||
while (opener != NULL && opener != stack_bottom &&
|
||||
opener != openers_bottom[closer->length % 3][closer->delim_char]) {
|
||||
while (opener != NULL && opener->position >= stack_bottom &&
|
||||
opener->position >= openers_bottom[closer->length % 3][closer->delim_char]) {
|
||||
if (opener->can_open && opener->delim_char == closer->delim_char) {
|
||||
// interior closer of size 2 can't match opener of size 1
|
||||
// or of size 1 can't match 2
|
||||
@@ -706,27 +738,29 @@ static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *sta
|
||||
} else {
|
||||
closer = closer->next;
|
||||
}
|
||||
} else if (closer->delim_char == '\'') {
|
||||
} else if (closer->delim_char == '\'' || closer->delim_char == '"') {
|
||||
cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
|
||||
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
|
||||
if (opener_found) {
|
||||
cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
|
||||
opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
|
||||
if (closer->delim_char == '\'') {
|
||||
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
|
||||
} else {
|
||||
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
|
||||
}
|
||||
closer = closer->next;
|
||||
} else if (closer->delim_char == '"') {
|
||||
cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
|
||||
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
|
||||
if (opener_found) {
|
||||
cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
|
||||
opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
|
||||
if (old_closer->delim_char == '\'') {
|
||||
opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
|
||||
} else {
|
||||
opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
|
||||
}
|
||||
remove_delimiter(subj, opener);
|
||||
remove_delimiter(subj, old_closer);
|
||||
}
|
||||
closer = closer->next;
|
||||
}
|
||||
if (!opener_found) {
|
||||
// set lower bound for future searches for openers
|
||||
openers_bottom[old_closer->length % 3][old_closer->delim_char] =
|
||||
old_closer->previous;
|
||||
old_closer->position;
|
||||
if (!old_closer->can_open) {
|
||||
// we can remove a closer that can't be an
|
||||
// opener, once we've seen there's no
|
||||
@@ -739,7 +773,8 @@ static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *sta
|
||||
}
|
||||
}
|
||||
// free all delimiters in list until stack_bottom:
|
||||
while (subj->last_delim != NULL && subj->last_delim != stack_bottom) {
|
||||
while (subj->last_delim != NULL &&
|
||||
subj->last_delim->position >= stack_bottom) {
|
||||
remove_delimiter(subj, subj->last_delim);
|
||||
}
|
||||
}
|
||||
@@ -778,7 +813,8 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
|
||||
tmp = opener_inl->next;
|
||||
while (tmp && tmp != closer_inl) {
|
||||
tmpnext = tmp->next;
|
||||
cmark_node_append_child(emph, tmp);
|
||||
cmark_node_unlink(tmp);
|
||||
append_child(emph, tmp);
|
||||
tmp = tmpnext;
|
||||
}
|
||||
cmark_node_insert_after(opener_inl, emph);
|
||||
@@ -915,7 +951,63 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) {
|
||||
}
|
||||
|
||||
// finally, try to match an html tag
|
||||
matchlen = scan_html_tag(&subj->input, subj->pos);
|
||||
if (subj->pos + 2 <= subj->input.len) {
|
||||
int c = subj->input.data[subj->pos];
|
||||
if (c == '!' && (subj->flags & FLAG_SKIP_HTML_COMMENT) == 0) {
|
||||
c = subj->input.data[subj->pos+1];
|
||||
if (c == '-' && subj->input.data[subj->pos+2] == '-') {
|
||||
if (subj->input.data[subj->pos+3] == '>') {
|
||||
matchlen = 4;
|
||||
} else if (subj->input.data[subj->pos+3] == '-' &&
|
||||
subj->input.data[subj->pos+4] == '>') {
|
||||
matchlen = 5;
|
||||
} else {
|
||||
matchlen = scan_html_comment(&subj->input, subj->pos + 1);
|
||||
if (matchlen > 0) {
|
||||
matchlen += 1; // prefix "<"
|
||||
} else { // no match through end of input: set a flag so
|
||||
// we don't reparse looking for -->:
|
||||
subj->flags |= FLAG_SKIP_HTML_COMMENT;
|
||||
}
|
||||
}
|
||||
} else if (c == '[') {
|
||||
if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) {
|
||||
matchlen = scan_html_cdata(&subj->input, subj->pos + 2);
|
||||
if (matchlen > 0) {
|
||||
// The regex doesn't require the final "]]>". But if we're not at
|
||||
// the end of input, it must come after the match. Otherwise,
|
||||
// disable subsequent scans to avoid quadratic behavior.
|
||||
matchlen += 5; // prefix "![", suffix "]]>"
|
||||
if (subj->pos + matchlen > subj->input.len) {
|
||||
subj->flags |= FLAG_SKIP_HTML_CDATA;
|
||||
matchlen = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) {
|
||||
matchlen = scan_html_declaration(&subj->input, subj->pos + 1);
|
||||
if (matchlen > 0) {
|
||||
matchlen += 2; // prefix "!", suffix ">"
|
||||
if (subj->pos + matchlen > subj->input.len) {
|
||||
subj->flags |= FLAG_SKIP_HTML_DECLARATION;
|
||||
matchlen = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (c == '?') {
|
||||
if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) {
|
||||
// Note that we allow an empty match.
|
||||
matchlen = scan_html_pi(&subj->input, subj->pos + 1);
|
||||
matchlen += 3; // prefix "?", suffix "?>"
|
||||
if (subj->pos + matchlen > subj->input.len) {
|
||||
subj->flags |= FLAG_SKIP_HTML_PI;
|
||||
matchlen = 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
matchlen = scan_html_tag(&subj->input, subj->pos);
|
||||
}
|
||||
}
|
||||
if (matchlen > 0) {
|
||||
contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
|
||||
subj->pos += matchlen;
|
||||
@@ -1170,7 +1262,7 @@ static cmark_node *handle_close_bracket_attribute(cmark_parser *parser, subject
|
||||
// Free the bracket ^[:
|
||||
cmark_node_free(opener->inl_text);
|
||||
|
||||
process_emphasis(parser, subj, opener->previous_delimiter);
|
||||
process_emphasis(parser, subj, opener->position);
|
||||
pop_bracket(subj);
|
||||
|
||||
return NULL;
|
||||
@@ -1201,12 +1293,6 @@ static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
|
||||
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
|
||||
}
|
||||
|
||||
if (!opener->active) {
|
||||
// take delimiter off stack
|
||||
pop_bracket(subj);
|
||||
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
|
||||
}
|
||||
|
||||
if (opener->type == ATTRIBUTE) {
|
||||
return handle_close_bracket_attribute(parser, subj, opener);
|
||||
}
|
||||
@@ -1215,6 +1301,12 @@ static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
|
||||
// Now we check to see if it's a link/image.
|
||||
is_image = opener->type == IMAGE;
|
||||
|
||||
if (!is_image && subj->no_link_openers) {
|
||||
// take delimiter off stack
|
||||
pop_bracket(subj);
|
||||
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
|
||||
}
|
||||
|
||||
after_link_text_pos = subj->pos;
|
||||
|
||||
// First, look for an inline link.
|
||||
@@ -1333,7 +1425,7 @@ noMatch:
|
||||
// begin replacing the opening '[' text node with a `^footnote-ref]` node.
|
||||
cmark_node_insert_before(opener->inl_text, fnref);
|
||||
|
||||
process_emphasis(parser, subj, opener->previous_delimiter);
|
||||
process_emphasis(parser, subj, opener->position);
|
||||
// sometimes, the footnote reference text gets parsed into multiple nodes
|
||||
// i.e. '[^example]' parsed into '[', '^exam', 'ple]'.
|
||||
// this happens for ex with the autolink extension. when the autolinker
|
||||
@@ -1379,42 +1471,22 @@ match:
|
||||
tmp = opener->inl_text->next;
|
||||
while (tmp) {
|
||||
tmpnext = tmp->next;
|
||||
cmark_node_append_child(inl, tmp);
|
||||
cmark_node_unlink(tmp);
|
||||
append_child(inl, tmp);
|
||||
tmp = tmpnext;
|
||||
}
|
||||
|
||||
// Free the bracket [:
|
||||
cmark_node_free(opener->inl_text);
|
||||
|
||||
process_emphasis(parser, subj, opener->previous_delimiter);
|
||||
process_emphasis(parser, subj, opener->position);
|
||||
pop_bracket(subj);
|
||||
|
||||
// Now, if we have a link, we also want to deactivate earlier link
|
||||
// delimiters. (This code can be removed if we decide to allow links
|
||||
// Now, if we have a link, we also want to deactivate links until
|
||||
// we get a new opener. (This code can be removed if we decide to allow links
|
||||
// inside links.)
|
||||
if (!is_image) {
|
||||
opener = subj->last_bracket;
|
||||
while (opener != NULL) {
|
||||
if (opener->type == LINK) {
|
||||
if (!opener->active) {
|
||||
break;
|
||||
} else {
|
||||
opener->active = false;
|
||||
}
|
||||
}
|
||||
opener = opener->previous;
|
||||
}
|
||||
bool in_image = false;
|
||||
if (opener) {
|
||||
in_image = opener->in_bracket[IMAGE];
|
||||
}
|
||||
bracket *opener2 = subj->last_bracket;
|
||||
while (opener2 != opener) {
|
||||
if (opener2->type == IMAGE) {
|
||||
opener2->in_bracket[IMAGE] = in_image;
|
||||
}
|
||||
opener2 = opener2->previous;
|
||||
}
|
||||
subj->no_link_openers = true;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
@@ -1623,7 +1695,7 @@ static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent,
|
||||
}
|
||||
|
||||
if (new_inl != NULL) {
|
||||
cmark_node_append_child(parent, new_inl);
|
||||
append_child(parent, new_inl);
|
||||
}
|
||||
|
||||
return 1;
|
||||
@@ -1643,7 +1715,7 @@ void cmark_parse_inlines(cmark_parser *parser,
|
||||
while (!is_eof(&subj) && parse_inline(parser, &subj, parent, options))
|
||||
;
|
||||
|
||||
process_emphasis(parser, &subj, NULL);
|
||||
process_emphasis(parser, &subj, 0);
|
||||
// free bracket and delim stack
|
||||
while (subj.last_delim) {
|
||||
remove_delimiter(&subj, subj.last_delim);
|
||||
|
||||
15
src/map.c
15
src/map.c
@@ -51,7 +51,7 @@ refsearch(const void *label, const void *p2) {
|
||||
}
|
||||
|
||||
static void sort_map(cmark_map *map) {
|
||||
unsigned int i = 0, last = 0, size = map->size;
|
||||
size_t i = 0, last = 0, size = map->size;
|
||||
cmark_map_entry *r = map->refs, **sorted = NULL;
|
||||
|
||||
sorted = (cmark_map_entry **)map->mem->calloc(size, sizeof(cmark_map_entry *));
|
||||
@@ -73,6 +73,7 @@ static void sort_map(cmark_map *map) {
|
||||
|
||||
cmark_map_entry *cmark_map_lookup(cmark_map *map, cmark_chunk *label) {
|
||||
cmark_map_entry **ref = NULL;
|
||||
cmark_map_entry *r = NULL;
|
||||
unsigned char *norm;
|
||||
|
||||
if (label->len < 1 || label->len > MAX_LINK_LABEL_LENGTH)
|
||||
@@ -91,10 +92,15 @@ cmark_map_entry *cmark_map_lookup(cmark_map *map, cmark_chunk *label) {
|
||||
ref = (cmark_map_entry **)bsearch(norm, map->sorted, map->size, sizeof(cmark_map_entry *), refsearch);
|
||||
map->mem->free(norm);
|
||||
|
||||
if (!ref)
|
||||
return NULL;
|
||||
if (ref != NULL) {
|
||||
r = ref[0];
|
||||
/* Check for expansion limit */
|
||||
if (r->size > map->max_ref_size - map->ref_size)
|
||||
return NULL;
|
||||
map->ref_size += r->size;
|
||||
}
|
||||
|
||||
return ref[0];
|
||||
return r;
|
||||
}
|
||||
|
||||
void cmark_map_free(cmark_map *map) {
|
||||
@@ -118,5 +124,6 @@ cmark_map *cmark_map_new(cmark_mem *mem, cmark_map_free_f free) {
|
||||
cmark_map *map = (cmark_map *)mem->calloc(1, sizeof(cmark_map));
|
||||
map->mem = mem;
|
||||
map->free = free;
|
||||
map->max_ref_size = UINT_MAX;
|
||||
return map;
|
||||
}
|
||||
|
||||
34
src/node.c
34
src/node.c
@@ -9,6 +9,40 @@ static void S_node_unlink(cmark_node *node);
|
||||
|
||||
#define NODE_MEM(node) cmark_node_mem(node)
|
||||
|
||||
cmark_node__internal_flags CMARK_NODE__OPEN;
|
||||
cmark_node__internal_flags CMARK_NODE__LAST_LINE_BLANK;
|
||||
cmark_node__internal_flags CMARK_NODE__LAST_LINE_CHECKED;
|
||||
|
||||
void cmark_register_node_flag(cmark_node__internal_flags *flags) {
|
||||
static uint8_t shift = 0;
|
||||
|
||||
// flags should be a pointer to a global variable and this function
|
||||
// should only be called once to initialize its value.
|
||||
if (*flags) {
|
||||
fprintf(stderr, "flag initialization error in cmark_register_node_flag\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
// Check that we haven't run out of bits.
|
||||
if (shift >= 8 * sizeof(cmark_node__internal_flags)) {
|
||||
fprintf(stderr, "too many flags in cmark_register_node_flag\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
*flags = (cmark_node__internal_flags)1 << shift;
|
||||
shift++;
|
||||
}
|
||||
|
||||
void cmark_init_standard_node_flags() {
|
||||
static int initialized = 0;
|
||||
if (!initialized) {
|
||||
initialized = 1;
|
||||
cmark_register_node_flag(&CMARK_NODE__OPEN);
|
||||
cmark_register_node_flag(&CMARK_NODE__LAST_LINE_BLANK);
|
||||
cmark_register_node_flag(&CMARK_NODE__LAST_LINE_CHECKED);
|
||||
}
|
||||
}
|
||||
|
||||
bool cmark_node_can_contain_type(cmark_node *node, cmark_node_type child_type) {
|
||||
if (child_type == CMARK_NODE_DOCUMENT) {
|
||||
return false;
|
||||
|
||||
@@ -35,6 +35,7 @@ void cmark_reference_create(cmark_map *map, cmark_chunk *label,
|
||||
ref->attributes = cmark_chunk_literal("");
|
||||
ref->entry.age = map->size;
|
||||
ref->entry.next = map->refs;
|
||||
ref->entry.size = ref->url.len + ref->title.len;
|
||||
|
||||
map->refs = (cmark_map_entry *)ref;
|
||||
map->size++;
|
||||
|
||||
24292
src/scanners.c
24292
src/scanners.c
File diff suppressed because it is too large
Load Diff
@@ -37,7 +37,7 @@ bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c,
|
||||
|
||||
tagname = [A-Za-z][A-Za-z0-9-]*;
|
||||
|
||||
blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'head'|'header'|'hr'|'html'|'iframe'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'section'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul';
|
||||
blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'head'|'header'|'hr'|'html'|'iframe'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'section'|'source'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul';
|
||||
|
||||
attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
|
||||
|
||||
@@ -54,16 +54,15 @@ bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c,
|
||||
opentag = tagname attribute* spacechar* [/]? [>];
|
||||
closetag = [/] tagname spacechar* [>];
|
||||
|
||||
htmlcomment = "!---->" | ("!--" ([-]? [^\x00>-]) ([-]? [^\x00-])* "-->");
|
||||
htmlcomment = "--" ([^\x00-]+ | "-" [^\x00-] | "--" [^\x00>])* "-->";
|
||||
|
||||
processinginstruction = "?" ([^?>\x00]+ | [?][^>\x00] | [>])* "?>";
|
||||
processinginstruction = ([^?>\x00]+ | [?][^>\x00] | [>])+;
|
||||
|
||||
declaration = "!" [A-Z]+ spacechar+ [^>\x00]* ">";
|
||||
declaration = [A-Z]+ spacechar+ [^>\x00]*;
|
||||
|
||||
cdata = "![CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])* "]]>";
|
||||
cdata = "CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])*;
|
||||
|
||||
htmltag = opentag | closetag | htmlcomment | processinginstruction |
|
||||
declaration | cdata;
|
||||
htmltag = opentag | closetag;
|
||||
|
||||
in_parens_nosp = [(] (reg_char|escaped_char|[\\])* [)];
|
||||
|
||||
@@ -133,6 +132,46 @@ bufsize_t _scan_liberal_html_tag(const unsigned char *p)
|
||||
*/
|
||||
}
|
||||
|
||||
// Try to match the `htmlcomment` pattern at the start of p. Returns the
// number of bytes matched (p - start), or 0 if the pattern does not match.
// NOTE(review): the opening `<!` is presumably consumed by the caller
// before p -- confirm against the pattern definition and the call site.
bufsize_t _scan_html_comment(const unsigned char *p)
|
||||
{
|
||||
const unsigned char *marker = NULL;
|
||||
const unsigned char *start = p;
|
||||
/*!re2c
|
||||
htmlcomment { return (bufsize_t)(p - start); }
|
||||
* { return 0; }
|
||||
*/
|
||||
}
|
||||
|
||||
// Try to match the `processinginstruction` pattern at the start of p.
// Returns the number of bytes matched (p - start), or 0 if the pattern
// does not match.
// NOTE(review): the `<?` / `?>` delimiters are presumably handled by the
// caller -- confirm against the pattern definition and the call site.
bufsize_t _scan_html_pi(const unsigned char *p)
|
||||
{
|
||||
const unsigned char *marker = NULL;
|
||||
const unsigned char *start = p;
|
||||
/*!re2c
|
||||
processinginstruction { return (bufsize_t)(p - start); }
|
||||
* { return 0; }
|
||||
*/
|
||||
}
|
||||
|
||||
// Try to match the `declaration` pattern at the start of p. Returns the
// number of bytes matched (p - start), or 0 if the pattern does not match.
// NOTE(review): the leading `<!` and closing `>` are presumably handled by
// the caller -- confirm against the pattern definition and the call site.
bufsize_t _scan_html_declaration(const unsigned char *p)
|
||||
{
|
||||
const unsigned char *marker = NULL;
|
||||
const unsigned char *start = p;
|
||||
/*!re2c
|
||||
declaration { return (bufsize_t)(p - start); }
|
||||
* { return 0; }
|
||||
*/
|
||||
}
|
||||
|
||||
// Try to match the `cdata` pattern at the start of p. Returns the number
// of bytes matched (p - start), or 0 if the pattern does not match.
// NOTE(review): the leading `<!` and closing `]]>` are presumably handled
// by the caller -- confirm against the pattern definition and the call site.
bufsize_t _scan_html_cdata(const unsigned char *p)
|
||||
{
|
||||
const unsigned char *marker = NULL;
|
||||
const unsigned char *start = p;
|
||||
/*!re2c
|
||||
cdata { return (bufsize_t)(p - start); }
|
||||
* { return 0; }
|
||||
*/
|
||||
}
|
||||
|
||||
// Try to match an HTML block tag start line, returning
|
||||
// an integer code for the type of block (1-6, matching the spec).
|
||||
// #7 is handled by a separate function, below.
|
||||
@@ -140,7 +179,7 @@ bufsize_t _scan_html_block_start(const unsigned char *p)
|
||||
{
|
||||
const unsigned char *marker = NULL;
|
||||
/*!re2c
|
||||
[<] ('script'|'pre'|'style') (spacechar | [>]) { return 1; }
|
||||
[<] ('script'|'pre'|'textarea'|'style') (spacechar | [>]) { return 1; }
|
||||
'<!--' { return 2; }
|
||||
'<?' { return 3; }
|
||||
'<!' [A-Z] { return 4; }
|
||||
@@ -167,7 +206,7 @@ bufsize_t _scan_html_block_end_1(const unsigned char *p)
|
||||
const unsigned char *marker = NULL;
|
||||
const unsigned char *start = p;
|
||||
/*!re2c
|
||||
[^\n\x00]* [<] [/] ('script'|'pre'|'style') [>] { return (bufsize_t)(p - start); }
|
||||
[^\n\x00]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return (bufsize_t)(p - start); }
|
||||
* { return 0; }
|
||||
*/
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ def pipe_through_prog(prog, text):
|
||||
|
||||
def parse(lib, extlib, text, extensions):
|
||||
cmark_gfm_core_extensions_ensure_registered = extlib.cmark_gfm_core_extensions_ensure_registered
|
||||
cmark_init_standard_node_flags = lib.cmark_init_standard_node_flags
|
||||
|
||||
find_syntax_extension = lib.cmark_find_syntax_extension
|
||||
find_syntax_extension.restype = c_void_p
|
||||
@@ -32,6 +33,7 @@ def parse(lib, extlib, text, extensions):
|
||||
parser_finish.restype = c_void_p
|
||||
parser_finish.argtypes = [c_void_p]
|
||||
|
||||
cmark_init_standard_node_flags()
|
||||
cmark_gfm_core_extensions_ensure_registered()
|
||||
|
||||
parser = parser_new(0)
|
||||
|
||||
@@ -581,6 +581,12 @@ www.github.com www.github.com/á
|
||||
|
||||
www.google.com/a_b
|
||||
|
||||
Underscores not allowed in host name www.xxx.yyy._zzz
|
||||
|
||||
Underscores not allowed in host name www.xxx._yyy.zzz
|
||||
|
||||
Underscores allowed in domain name www._xxx.yyy.zzz
|
||||
|
||||
**Autolink and http://inlines**
|
||||
|
||||
![http://inline.com/image](http://inline.com/image)
|
||||
@@ -618,6 +624,9 @@ http://🍄.ga/ http://x🍄.ga/
|
||||
<p>Email me at:<a href="mailto:scyther@pokemon.com">scyther@pokemon.com</a></p>
|
||||
<p><a href="http://www.github.com">www.github.com</a> <a href="http://www.github.com/%C3%A1">www.github.com/á</a></p>
|
||||
<p><a href="http://www.google.com/a_b">www.google.com/a_b</a></p>
|
||||
<p>Underscores not allowed in host name www.xxx.yyy._zzz</p>
|
||||
<p>Underscores not allowed in host name www.xxx._yyy.zzz</p>
|
||||
<p>Underscores allowed in domain name <a href="http://www._xxx.yyy.zzz">www._xxx.yyy.zzz</a></p>
|
||||
<p><strong>Autolink and <a href="http://inlines">http://inlines</a></strong></p>
|
||||
<p><img src="http://inline.com/image" alt="http://inline.com/image" /></p>
|
||||
<p><a href="mailto:a.w@b.c">a.w@b.c</a></p>
|
||||
|
||||
@@ -63,6 +63,9 @@ pathological = {
|
||||
"pattern [ (]( repeated":
|
||||
(("[ (](" * 80000),
|
||||
re.compile("(\[ \(\]\(){80000}")),
|
||||
"pattern ![[]() repeated":
|
||||
("![[]()" * 160000,
|
||||
re.compile("(!\[<a href=\"\"></a>){160000}")),
|
||||
"hard link/emph case":
|
||||
("**x [a*b**c*](d)",
|
||||
re.compile("\\*\\*x <a href=\"d\">a<em>b\\*\\*c</em></a>")),
|
||||
@@ -87,6 +90,9 @@ pathological = {
|
||||
"unclosed links B":
|
||||
("[a](b" * 30000,
|
||||
re.compile("(\[a\]\(b){30000}")),
|
||||
"unclosed <!--":
|
||||
("</" + "<!--" * 300000,
|
||||
re.compile("\<\/(\<!--){300000}")),
|
||||
"tables":
|
||||
("aaa\rbbb\n-\v\n" * 30000,
|
||||
re.compile("^<p>aaa</p>\n<table>\n<thead>\n<tr>\n<th>bbb</th>\n</tr>\n</thead>\n<tbody>\n(<tr>\n<td>aaa</td>\n</tr>\n<tr>\n<td>bbb</td>\n</tr>\n<tr>\n<td>-\x0b</td>\n</tr>\n){29999}</tbody>\n</table>\n$")),
|
||||
|
||||
@@ -366,3 +366,11 @@ Hello world
|
||||
.
|
||||
<p>Hello world</p>
|
||||
````````````````````````````````
|
||||
|
||||
Issue #424 - emphasis before links
|
||||
|
||||
```````````````````````````````` example
|
||||
*text* [link](#section)
|
||||
.
|
||||
<p><em>text</em> <a href="#section">link</a></p>
|
||||
````````````````````````````````
|
||||
|
||||
@@ -130,7 +130,7 @@ questions it does not answer:
|
||||
not require that. This is hardly a "corner case," and divergences
|
||||
between implementations on this issue often lead to surprises for
|
||||
users in real documents. (See [this comment by John
|
||||
Gruber](http://article.gmane.org/gmane.text.markdown.general/1997).)
|
||||
Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)
|
||||
|
||||
2. Is a blank line needed before a block quote or heading?
|
||||
Most implementations do not require the blank line. However,
|
||||
@@ -138,7 +138,7 @@ questions it does not answer:
|
||||
also to ambiguities in parsing (note that some implementations
|
||||
put the heading inside the blockquote, while others do not).
|
||||
(John Gruber has also spoken [in favor of requiring the blank
|
||||
lines](http://article.gmane.org/gmane.text.markdown.general/2146).)
|
||||
lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)
|
||||
|
||||
3. Is a blank line needed before an indented code block?
|
||||
(`Markdown.pl` requires it, but this is not mentioned in the
|
||||
@@ -171,7 +171,7 @@ questions it does not answer:
|
||||
```
|
||||
|
||||
(There are some relevant comments by John Gruber
|
||||
[here](http://article.gmane.org/gmane.text.markdown.general/2554).)
|
||||
[here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)
|
||||
|
||||
5. Can list markers be indented? Can ordered list markers be right-aligned?
|
||||
|
||||
@@ -1001,10 +1001,7 @@ interpretable as a [code fence], [ATX heading][ATX headings],
|
||||
|
||||
A [setext heading underline](@) is a sequence of
|
||||
`=` characters or a sequence of `-` characters, with no more than 3
|
||||
spaces indentation and any number of trailing spaces. If a line
|
||||
containing a single `-` can be interpreted as an
|
||||
empty [list items], it should be interpreted this way
|
||||
and not as a [setext heading underline].
|
||||
spaces of indentation and any number of trailing spaces or tabs.
|
||||
|
||||
The heading is a level 1 heading if `=` characters are used in
|
||||
the [setext heading underline], and a level 2 heading if `-`
|
||||
@@ -1638,7 +1635,7 @@ has been found, the code block contains all of the lines after the
|
||||
opening code fence until the end of the containing block (or
|
||||
document). (An alternative spec would require backtracking in the
|
||||
event that a closing code fence is not found. But this makes parsing
|
||||
much less efficient, and there seems to be no real down side to the
|
||||
much less efficient, and there seems to be no real downside to the
|
||||
behavior described here.)
|
||||
|
||||
A fenced code block may interrupt a paragraph, and does not require
|
||||
@@ -2068,7 +2065,7 @@ followed by an uppercase ASCII letter.\
|
||||
`<![CDATA[`.\
|
||||
**End condition:** line contains the string `]]>`.
|
||||
|
||||
6. **Start condition:** line begins the string `<` or `</`
|
||||
6. **Start condition:** line begins with the string `<` or `</`
|
||||
followed by one of the strings (case-insensitive) `address`,
|
||||
`article`, `aside`, `base`, `basefont`, `blockquote`, `body`,
|
||||
`caption`, `center`, `col`, `colgroup`, `dd`, `details`, `dialog`,
|
||||
@@ -5279,7 +5276,7 @@ well. ([reStructuredText](http://docutils.sourceforge.net/rst.html)
|
||||
takes a different approach, requiring blank lines before lists
|
||||
even inside other list items.)
|
||||
|
||||
In order to solve of unwanted lists in paragraphs with
|
||||
In order to solve the problem of unwanted lists in paragraphs with
|
||||
hard-wrapped numerals, we allow only lists starting with `1` to
|
||||
interrupt paragraphs. Thus,
|
||||
|
||||
@@ -9410,10 +9407,9 @@ character, and a `>` character.
|
||||
A [closing tag](@) consists of the string `</`, a
|
||||
[tag name], optional [whitespace], and the character `>`.
|
||||
|
||||
An [HTML comment](@) consists of `<!--` + *text* + `-->`,
|
||||
where *text* does not start with `>` or `->`, does not end with `-`,
|
||||
and does not contain `--`. (See the
|
||||
[HTML5 spec](http://www.w3.org/TR/html5/syntax.html#comments).)
|
||||
An [HTML comment](@) consists of `<!-->`, `<!--->`, or `<!--`, a string of
|
||||
characters not including the string `-->`, and `-->` (see the
|
||||
[HTML spec](https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state)).
|
||||
|
||||
A [processing instruction](@)
|
||||
consists of the string `<?`, a string
|
||||
@@ -9554,30 +9550,20 @@ Illegal attributes in closing tag:
|
||||
Comments:
|
||||
|
||||
```````````````````````````````` example
|
||||
foo <!-- this is a
|
||||
comment - with hyphen -->
|
||||
foo <!-- this is a --
|
||||
comment - with hyphens -->
|
||||
.
|
||||
<p>foo <!-- this is a
|
||||
comment - with hyphen --></p>
|
||||
<p>foo <!-- this is a --
|
||||
comment - with hyphens --></p>
|
||||
````````````````````````````````
|
||||
|
||||
|
||||
```````````````````````````````` example
|
||||
foo <!-- not a comment -- two hyphens -->
|
||||
.
|
||||
<p>foo &lt;!-- not a comment -- two hyphens --&gt;</p>
|
||||
````````````````````````````````
|
||||
|
||||
|
||||
Not comments:
|
||||
|
||||
```````````````````````````````` example
|
||||
foo <!--> foo -->
|
||||
|
||||
foo <!-- foo--->
|
||||
foo <!---> foo -->
|
||||
.
|
||||
<p>foo &lt;!--&gt; foo --&gt;</p>
|
||||
<p>foo &lt;!-- foo---&gt;</p>
|
||||
<p>foo <!--> foo --&gt;</p>
|
||||
<p>foo <!---> foo --&gt;</p>
|
||||
````````````````````````````````
|
||||
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y \
|
||||
wget \
|
||||
clang \
|
||||
man \
|
||||
clang-format-3.5 \
|
||||
clang-format \
|
||||
&& apt-get clean
|
||||
|
||||
RUN wget http://lcamtuf.coredump.cx/afl/releases/afl-latest.tgz && \
|
||||
|
||||
Reference in New Issue
Block a user