Fix 8-bit codepoint escape in parse_engine.def.hpp, fix tests

This commit is contained in:
mutativesystems
2025-09-15 16:10:18 +07:00
parent c90f8b3615
commit 8fef3314bf
3 changed files with 24 additions and 13 deletions

View File

@@ -895,7 +895,7 @@ ja: 惑星(ガス)
zh:
# UTF8 decoding only happens in double-quoted strings,
# as per the YAML standard
decode this: "\u263A \xE2\x98\xBA"
decode this: "\u263A c\x61f\xE9"
and this as well: "\u2705 \U0001D11E"
not decoded: '\u263A \xE2\x98\xBA'
neither this: '\u2705 \U0001D11E'
@@ -909,7 +909,7 @@ neither this: '\u2705 \U0001D11E'
// and \x \u \U codepoints are decoded, but only when they appear
// inside double-quoted strings, as dictated by the YAML
// standard:
CHECK(langs["decode this"].val() == "");
CHECK(langs["decode this"].val() == "café");
CHECK(langs["and this as well"].val() == "✅ 𝄞");
CHECK(langs["not decoded"].val() == "\\u263A \\xE2\\x98\\xBA");
CHECK(langs["neither this"].val() == "\\u2705 \\U0001D11E");

View File

@@ -2661,19 +2661,24 @@ void ParseEngine<EventHandler>::_filter_dquoted_backslash(FilterProcessor &C4_RE
{
proc.translate_esc('\\');
}
else if(next == 'x') // UTF8
else if(next == 'x') // 2-digit Unicode escape (\xXX), code point 0x00-0xFF
{
if(C4_UNLIKELY(proc.rpos + 1u + 2u >= proc.src.len))
_c4err("\\x requires 2 hex digits. scalar pos={}", proc.rpos);
char readbuf[8];
csubstr codepoint = proc.src.sub(proc.rpos + 2u, 2u);
_c4dbgfdq("utf8 ~~~{}~~~ rpos={} rem=~~~{}~~~", codepoint, proc.rpos, proc.src.sub(proc.rpos));
uint8_t byteval = {};
if(C4_UNLIKELY(!read_hex(codepoint, &byteval)))
uint32_t codepoint_val = {};
if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val)))
_c4err("failed to read \\x codepoint. scalar pos={}", proc.rpos);
proc.translate_esc_bulk((const char*)&byteval, 1u, /*nread*/3u);
const size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
if(C4_UNLIKELY(numbytes == 0))
_c4err("failed to decode code point={}", proc.rpos);
_RYML_CB_ASSERT(callbacks(), numbytes <= 4);
proc.translate_esc_bulk(readbuf, numbytes, /*nread*/3u);
_c4dbgfdq("utf8 after rpos={} rem=~~~{}~~~", proc.rpos, proc.src.sub(proc.rpos));
}
else if(next == 'u') // UTF16
else if(next == 'u') // 4-digit Unicode escape (\uXXXX), code point 0x0000-0xFFFF
{
if(C4_UNLIKELY(proc.rpos + 1u + 4u >= proc.src.len))
_c4err("\\u requires 4 hex digits. scalar pos={}", proc.rpos);
@@ -2688,7 +2693,7 @@ void ParseEngine<EventHandler>::_filter_dquoted_backslash(FilterProcessor &C4_RE
_RYML_CB_ASSERT(callbacks(), numbytes <= 4);
proc.translate_esc_bulk(readbuf, numbytes, /*nread*/5u);
}
else if(next == 'U') // UTF32
else if(next == 'U') // 8-digit Unicode escape (\UXXXXXXXX), full 32-bit code point
{
if(C4_UNLIKELY(proc.rpos + 1u + 8u >= proc.src.len))
_c4err("\\U requires 8 hex digits. scalar pos={}", proc.rpos);

View File

@@ -294,34 +294,40 @@ dquoted_case test_cases_filter[] = {
// 50
dqc(R"(\P\P\P\P)", dqesc_P4),
dqc(R"(\\\"\n\r\t\ \/\ \0\b\f\a\v\e\_\N\L\P)", dqescparsed),
dqc(R"(\xE4)", R"(ä)"),
dqc(R"(\xD7)", R"(×)"),
dqc(R"(\xA9)", R"(©)"),
// 55
dqc(R"(\xB5)", R"(µ)"),
dqc(R"(\xF7)", R"(÷)"),
dqc(R"(\u263A)", R"(☺)"),
dqc(R"(\u263a)", R"(☺)"),
dqc(R"(\u2705)", R"(✅)"),
// 55
// 60
dqc(R"(\u2705\u2705)", R"(✅✅)"),
dqc(R"(\u2705\u2705\u2705)", R"(✅✅✅)"),
dqc(R"(\u2705\u2705\u2705\u2705)", R"(✅✅✅✅)"),
dqc(R"(\U0001D11E)", R"(𝄞)"),
dqc(R"(\U0001d11e)", R"(𝄞)"),
// 60
// 65
dqc(R"(\U0001d11e\U0001D11E)", R"(𝄞𝄞)"),
dqc(R"(\U0001d11e\U0001D11E\U0001D11E)", R"(𝄞𝄞𝄞)"),
dqc(R"(\U0001d11e\U0001D11E\U0001D11E\U0001D11E)", R"(𝄞𝄞𝄞𝄞)"),
dqc(R"(\u263A\u2705\U0001D11E)", R"(☺✅𝄞)"),
dqc(R"(\b1998\t1999\t2000\n)", "\b1998\t1999\t2000\n"),
// 65
// 70
dqc(R"(\x0d\x0a is \r\n)", "\r\n is \r\n"),
dqc("\n foo\n\n bar\n\n baz\n", " foo\nbar\nbaz "),
dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
// 70
// 75
dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
dqc("\n ", " "),
dqc(" \n ", " "),
dqc("\n\n ", "\n"),
dqc("\n\n\n ", "\n\n"),
// 75
// 80
dqc("folded \nto a space, \n \nto a line feed, or \\\n \\ non-content", "folded to a space,\nto a line feed, or \t \tnon-content"),
dqc("folded \nto a space,\n \nto a line feed, or \\\n \\ non-content", "folded to a space,\nto a line feed, or \t \tnon-content"),
//dqc(" \n\ndetected\n\n", "\t\ndetected\n"), // this case cannot be prefixed with anything.