mirror of
https://github.com/biojppm/rapidyaml.git
synced 2026-01-18 21:41:18 +01:00
Fix 8-bit codepoint escape in parse_engine.def.hpp, fix tests
This commit is contained in:
@@ -895,7 +895,7 @@ ja: 惑星(ガス)
|
||||
zh: 行星(气体)
|
||||
# UTF8 decoding only happens in double-quoted strings,
|
||||
# as per the YAML standard
|
||||
decode this: "\u263A \xE2\x98\xBA"
|
||||
decode this: "\u263A c\x61f\xE9"
|
||||
and this as well: "\u2705 \U0001D11E"
|
||||
not decoded: '\u263A \xE2\x98\xBA'
|
||||
neither this: '\u2705 \U0001D11E'
|
||||
@@ -909,7 +909,7 @@ neither this: '\u2705 \U0001D11E'
|
||||
// and \x \u \U codepoints are decoded, but only when they appear
|
||||
// inside double-quoted strings, as dictated by the YAML
|
||||
// standard:
|
||||
CHECK(langs["decode this"].val() == "☺ ☺");
|
||||
CHECK(langs["decode this"].val() == "☺ café");
|
||||
CHECK(langs["and this as well"].val() == "✅ 𝄞");
|
||||
CHECK(langs["not decoded"].val() == "\\u263A \\xE2\\x98\\xBA");
|
||||
CHECK(langs["neither this"].val() == "\\u2705 \\U0001D11E");
|
||||
|
||||
@@ -2661,19 +2661,24 @@ void ParseEngine<EventHandler>::_filter_dquoted_backslash(FilterProcessor &C4_RE
|
||||
{
|
||||
proc.translate_esc('\\');
|
||||
}
|
||||
else if(next == 'x') // UTF8
|
||||
else if(next == 'x') // 2-digit Unicode escape (\xXX), code point 0x00–0xFF
|
||||
{
|
||||
if(C4_UNLIKELY(proc.rpos + 1u + 2u >= proc.src.len))
|
||||
_c4err("\\x requires 2 hex digits. scalar pos={}", proc.rpos);
|
||||
char readbuf[8];
|
||||
csubstr codepoint = proc.src.sub(proc.rpos + 2u, 2u);
|
||||
_c4dbgfdq("utf8 ~~~{}~~~ rpos={} rem=~~~{}~~~", codepoint, proc.rpos, proc.src.sub(proc.rpos));
|
||||
uint8_t byteval = {};
|
||||
if(C4_UNLIKELY(!read_hex(codepoint, &byteval)))
|
||||
uint32_t codepoint_val = {};
|
||||
if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val)))
|
||||
_c4err("failed to read \\x codepoint. scalar pos={}", proc.rpos);
|
||||
proc.translate_esc_bulk((const char*)&byteval, 1u, /*nread*/3u);
|
||||
const size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
|
||||
if(C4_UNLIKELY(numbytes == 0))
|
||||
_c4err("failed to decode code point={}", proc.rpos);
|
||||
_RYML_CB_ASSERT(callbacks(), numbytes <= 4);
|
||||
proc.translate_esc_bulk(readbuf, numbytes, /*nread*/3u);
|
||||
_c4dbgfdq("utf8 after rpos={} rem=~~~{}~~~", proc.rpos, proc.src.sub(proc.rpos));
|
||||
}
|
||||
else if(next == 'u') // UTF16
|
||||
else if(next == 'u') // 4-digit Unicode escape (\uXXXX), code point 0x0000–0xFFFF
|
||||
{
|
||||
if(C4_UNLIKELY(proc.rpos + 1u + 4u >= proc.src.len))
|
||||
_c4err("\\u requires 4 hex digits. scalar pos={}", proc.rpos);
|
||||
@@ -2688,7 +2693,7 @@ void ParseEngine<EventHandler>::_filter_dquoted_backslash(FilterProcessor &C4_RE
|
||||
_RYML_CB_ASSERT(callbacks(), numbytes <= 4);
|
||||
proc.translate_esc_bulk(readbuf, numbytes, /*nread*/5u);
|
||||
}
|
||||
else if(next == 'U') // UTF32
|
||||
else if(next == 'U') // 8-digit Unicode escape (\UXXXXXXXX), full 32-bit code point
|
||||
{
|
||||
if(C4_UNLIKELY(proc.rpos + 1u + 8u >= proc.src.len))
|
||||
_c4err("\\U requires 8 hex digits. scalar pos={}", proc.rpos);
|
||||
|
||||
@@ -294,34 +294,40 @@ dquoted_case test_cases_filter[] = {
|
||||
// 50
|
||||
dqc(R"(\P\P\P\P)", dqesc_P4),
|
||||
dqc(R"(\\\"\n\r\t\ \/\ \0\b\f\a\v\e\_\N\L\P)", dqescparsed),
|
||||
dqc(R"(\xE4)", R"(ä)"),
|
||||
dqc(R"(\xD7)", R"(×)"),
|
||||
dqc(R"(\xA9)", R"(©)"),
|
||||
// 55
|
||||
dqc(R"(\xB5)", R"(µ)"),
|
||||
dqc(R"(\xF7)", R"(÷)"),
|
||||
dqc(R"(\u263A)", R"(☺)"),
|
||||
dqc(R"(\u263a)", R"(☺)"),
|
||||
dqc(R"(\u2705)", R"(✅)"),
|
||||
// 55
|
||||
// 60
|
||||
dqc(R"(\u2705\u2705)", R"(✅✅)"),
|
||||
dqc(R"(\u2705\u2705\u2705)", R"(✅✅✅)"),
|
||||
dqc(R"(\u2705\u2705\u2705\u2705)", R"(✅✅✅✅)"),
|
||||
dqc(R"(\U0001D11E)", R"(𝄞)"),
|
||||
dqc(R"(\U0001d11e)", R"(𝄞)"),
|
||||
// 60
|
||||
// 65
|
||||
dqc(R"(\U0001d11e\U0001D11E)", R"(𝄞𝄞)"),
|
||||
dqc(R"(\U0001d11e\U0001D11E\U0001D11E)", R"(𝄞𝄞𝄞)"),
|
||||
dqc(R"(\U0001d11e\U0001D11E\U0001D11E\U0001D11E)", R"(𝄞𝄞𝄞𝄞)"),
|
||||
dqc(R"(\u263A\u2705\U0001D11E)", R"(☺✅𝄞)"),
|
||||
dqc(R"(\b1998\t1999\t2000\n)", "\b1998\t1999\t2000\n"),
|
||||
// 65
|
||||
// 70
|
||||
dqc(R"(\x0d\x0a is \r\n)", "\r\n is \r\n"),
|
||||
dqc("\n foo\n\n bar\n\n baz\n", " foo\nbar\nbaz "),
|
||||
dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
|
||||
dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
|
||||
dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
|
||||
// 70
|
||||
// 75
|
||||
dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
|
||||
dqc("\n ", " "),
|
||||
dqc(" \n ", " "),
|
||||
dqc("\n\n ", "\n"),
|
||||
dqc("\n\n\n ", "\n\n"),
|
||||
// 75
|
||||
// 80
|
||||
dqc("folded \nto a space, \n \nto a line feed, or \\\n \\ non-content", "folded to a space,\nto a line feed, or \t \tnon-content"),
|
||||
dqc("folded \nto a space,\n \nto a line feed, or \\\n \\ non-content", "folded to a space,\nto a line feed, or \t \tnon-content"),
|
||||
//dqc(" \n\ndetected\n\n", "\t\ndetected\n"), // this case cannot be prefixed with anything.
|
||||
|
||||
Reference in New Issue
Block a user