Fix 8-bit codepoint escape in parse_engine.def.hpp, fix tests

2026-01-18 21:41:18 +01:00 · 2025-09-15 16:10:18 +07:00
parent c90f8b3615
commit 8fef3314bf
3 changed files with 24 additions and 13 deletions
--- a/samples/quickstart.cpp
+++ b/samples/quickstart.cpp
@@ -895,7 +895,7 @@ ja: 惑星（ガス）
 zh: 行星（气体）
 # UTF8 decoding only happens in double-quoted strings,
 # as per the YAML standard
-decode this: "\u263A \xE2\x98\xBA"
+decode this: "\u263A c\x61f\xE9"
 and this as well: "\u2705 \U0001D11E"
 not decoded: '\u263A \xE2\x98\xBA'
 neither this: '\u2705 \U0001D11E'
@@ -909,7 +909,7 @@ neither this: '\u2705 \U0001D11E'
    // and \x \u \U codepoints are decoded, but only when they appear
    // inside double-quoted strings, as dictated by the YAML
    // standard:
-    CHECK(langs["decode this"].val() == "☺ ☺");
+    CHECK(langs["decode this"].val() == "☺ café");
    CHECK(langs["and this as well"].val() == "✅ 𝄞");
    CHECK(langs["not decoded"].val() == "\\u263A \\xE2\\x98\\xBA");
    CHECK(langs["neither this"].val() == "\\u2705 \\U0001D11E");
--- a/src/c4/yml/parse_engine.def.hpp
+++ b/src/c4/yml/parse_engine.def.hpp
@@ -2661,19 +2661,24 @@ void ParseEngine<EventHandler>::_filter_dquoted_backslash(FilterProcessor &C4_RE
    {
        proc.translate_esc('\\');
    }
-    else if(next == 'x') // UTF8
+    else if(next == 'x') // 2-digit Unicode escape (\xXX), code point 0x00–0xFF
    {
        if(C4_UNLIKELY(proc.rpos + 1u + 2u >= proc.src.len))
            _c4err("\\x requires 2 hex digits. scalar pos={}", proc.rpos);
+        char readbuf[8];
        csubstr codepoint = proc.src.sub(proc.rpos + 2u, 2u);
        _c4dbgfdq("utf8 ~~~{}~~~ rpos={} rem=~~~{}~~~", codepoint, proc.rpos, proc.src.sub(proc.rpos));
-        uint8_t byteval = {};
-        if(C4_UNLIKELY(!read_hex(codepoint, &byteval)))
+        uint32_t codepoint_val = {};
+        if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val)))
            _c4err("failed to read \\x codepoint. scalar pos={}", proc.rpos);
-        proc.translate_esc_bulk((const char*)&byteval, 1u, /*nread*/3u);
+        const size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
+        if(C4_UNLIKELY(numbytes == 0))
+            _c4err("failed to decode code point={}", proc.rpos);
+        _RYML_CB_ASSERT(callbacks(), numbytes <= 4);
+        proc.translate_esc_bulk(readbuf, numbytes, /*nread*/3u);
        _c4dbgfdq("utf8 after rpos={} rem=~~~{}~~~", proc.rpos, proc.src.sub(proc.rpos));
    }
-    else if(next == 'u') // UTF16
+    else if(next == 'u') // 4-digit Unicode escape (\uXXXX), code point 0x0000–0xFFFF
    {
        if(C4_UNLIKELY(proc.rpos + 1u + 4u >= proc.src.len))
            _c4err("\\u requires 4 hex digits. scalar pos={}", proc.rpos);
@@ -2688,7 +2693,7 @@ void ParseEngine<EventHandler>::_filter_dquoted_backslash(FilterProcessor &C4_RE
        _RYML_CB_ASSERT(callbacks(), numbytes <= 4);
        proc.translate_esc_bulk(readbuf, numbytes, /*nread*/5u);
    }
-    else if(next == 'U') // UTF32
+    else if(next == 'U') // 8-digit Unicode escape (\UXXXXXXXX), full 32-bit code point
    {
        if(C4_UNLIKELY(proc.rpos + 1u + 8u >= proc.src.len))
            _c4err("\\U requires 8 hex digits. scalar pos={}", proc.rpos);
--- a/test/test_scalar_dquoted.cpp
+++ b/test/test_scalar_dquoted.cpp
@@ -294,34 +294,40 @@ dquoted_case test_cases_filter[] = {
    // 50
    dqc(R"(\P\P\P\P)", dqesc_P4),
    dqc(R"(\\\"\n\r\t\	\/\ \0\b\f\a\v\e\_\N\L\P)", dqescparsed),
+    dqc(R"(\xE4)", R"(ä)"),
+    dqc(R"(\xD7)", R"(×)"),
+    dqc(R"(\xA9)", R"(©)"),
+    // 55
+    dqc(R"(\xB5)", R"(µ)"),
+    dqc(R"(\xF7)", R"(÷)"),
    dqc(R"(\u263A)", R"(☺)"),
    dqc(R"(\u263a)", R"(☺)"),
    dqc(R"(\u2705)", R"(✅)"),
-    // 55
+    // 60
    dqc(R"(\u2705\u2705)", R"(✅✅)"),
    dqc(R"(\u2705\u2705\u2705)", R"(✅✅✅)"),
    dqc(R"(\u2705\u2705\u2705\u2705)", R"(✅✅✅✅)"),
    dqc(R"(\U0001D11E)", R"(𝄞)"),
    dqc(R"(\U0001d11e)", R"(𝄞)"),
-    // 60
+    // 65
    dqc(R"(\U0001d11e\U0001D11E)", R"(𝄞𝄞)"),
    dqc(R"(\U0001d11e\U0001D11E\U0001D11E)", R"(𝄞𝄞𝄞)"),
    dqc(R"(\U0001d11e\U0001D11E\U0001D11E\U0001D11E)", R"(𝄞𝄞𝄞𝄞)"),
    dqc(R"(\u263A\u2705\U0001D11E)", R"(☺✅𝄞)"),
    dqc(R"(\b1998\t1999\t2000\n)", "\b1998\t1999\t2000\n"),
-    // 65
+    // 70
    dqc(R"(\x0d\x0a is \r\n)", "\r\n is \r\n"),
    dqc("\n  foo\n\n    bar\n\n  baz\n", " foo\nbar\nbaz "),
    dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
    dqc(" 1st non-empty\n\n 2nd non-empty \n	3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
    dqc(" 1st non-empty\n\n 2nd non-empty 	\n 	3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
-    // 70
+    // 75
    dqc(" 1st non-empty\n\n 2nd non-empty	 \n	3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "),
    dqc("\n  ", " "),
    dqc("  \n  ", " "),
    dqc("\n\n  ", "\n"),
    dqc("\n\n\n  ", "\n\n"),
-    // 75
+    // 80
    dqc("folded \nto a space,	\n \nto a line feed, or 	\\\n \\ 	non-content", "folded to a space,\nto a line feed, or \t \tnon-content"),
    dqc("folded \nto a space,\n \nto a line feed, or 	\\\n \\ 	non-content", "folded to a space,\nto a line feed, or \t \tnon-content"),
    //dqc("	\n\ndetected\n\n", "\t\ndetected\n"), // this case cannot be prefixed with anything.