From a8a56a0d1b0c949ee59224282569c991a914da46 Mon Sep 17 00:00:00 2001
From: Ivan Romanov
Date: Mon, 5 Jan 2026 22:37:19 +0500
Subject: [PATCH] Add WTF-8 support

Serialize the following as \uXXXX escape sequences instead of copying
the raw bytes to the output:

 - lone surrogates U+D800..U+DFFF stored as WTF-8 (ED A0..BF 80..BF),
   which are not valid UTF-8;
 - control characters below U+0020 that have no two-character escape;
 - the line/paragraph separators U+2028 and U+2029 (E2 80 A8|A9);
 - the noncharacters U+FFFE and U+FFFF (EF BF BE|BF).

measure_string and serialize_string apply the same rules so that the
measured length equals the number of bytes written.
---
Review notes (not part of the commit message):
 - U+2028/U+2029 and U+FFFE/U+FFFF are now matched by their real UTF-8
   lead bytes (E2 and EF). They were tested under lead byte ED, where the
   same continuation bytes denote U+D028/U+D029 and U+DFFE/U+DFFF.
 - measure_string now skips the two continuation bytes of an escaped
   sequence, as serialize_string already did; before, it counted 8 bytes
   for a sequence that is written as 6.
 - The look-ahead test is spelled "length - i > 2" so it cannot wrap.
 - NOTE(review): whitespace of the context lines was restored by hand and
   the blob ids on the index lines predate these changes; run
   "git apply --check" before merging.

 json-builder.c       | 82 ++++++++++++++++++++++++++++++++++++++++++--
 test/main.cc         |  1 +
 test/valid-0013.json | 14 ++++++++
 3 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 test/valid-0013.json

diff --git a/json-builder.c b/json-builder.c
index 2b96e6d..30d1817 100644
--- a/json-builder.c
+++ b/json-builder.c
@@ -480,7 +480,39 @@ static size_t measure_string (unsigned int length,
 
       default:
 
-         ++ measured_length;
+         if ((unsigned char)c <= 0x1F)
+         {
+            /* Control character without a short escape: six-byte escape */
+            measured_length += 6;
+         }
+         else if (length - i > 2)
+         {
+            unsigned char c1 = (unsigned char)c;
+            unsigned char c2 = (unsigned char)str [i + 1];
+            unsigned char c3 = (unsigned char)str [i + 2];
+
+            /* U+2028 / U+2029 (E2 80 A8|A9), U+FFFE / U+FFFF (EF BF BE|BF)
+             * and WTF-8 encoded surrogates U+D800..U+DFFF (ED A0..BF xx) */
+            if ((c1 == 0xE2 && c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9))
+                || (c1 == 0xEF && c2 == 0xBF && (c3 == 0xBE || c3 == 0xBF))
+                || (c1 == 0xED && c2 >= 0xA0 && c2 <= 0xBF
+                    && c3 >= 0x80 && c3 <= 0xBF))
+            {
+               /* Three input bytes become one six-byte escape; skip the two
+                * continuation bytes so they are not counted a second time
+                * (serialize_string consumes them the same way). */
+               measured_length += 6;
+               i += 2;
+            }
+            else
+            {
+               measured_length++;
+            }
+         }
+         else
+         {
+            measured_length++;
+         }
          break;
       };
    };
@@ -493,6 +525,17 @@ static size_t measure_string (unsigned int length,
    *buf ++ = (c);              \
 } while(0);                    \
 
+/* Escape code point cp (0..0xFFFF) as \uXXXX.
+ * Expects `buf` and `hex` in the calling scope; evaluates cp four times. */
+#define PRINT_ESCAPED_CP(cp) do {        \
+   *buf ++ = '\\';                       \
+   *buf ++ = 'u';                        \
+   *buf ++ = hex[((cp) >> 12) & 0xF];    \
+   *buf ++ = hex[((cp) >> 8) & 0xF];     \
+   *buf ++ = hex[((cp) >> 4) & 0xF];     \
+   *buf ++ = hex[(cp) & 0xF];            \
+} while(0)
+
 static size_t serialize_string (json_char * buf,
                                 unsigned int length,
                                 const json_char * str)
@@ -500,6 +543,8 @@ static size_t serialize_string (json_char * buf,
    json_char * orig_buf = buf;
    unsigned int i;
 
+   static const char hex[] = "0123456789ABCDEF";
+
    for(i = 0; i < length; ++ i)
    {
       json_char c = str [i];
@@ -516,7 +561,40 @@ static size_t serialize_string (json_char * buf,
 
       default:
 
-         *buf ++ = c;
+         if ((unsigned char)c <= 0x1F)
+         {
+            /* Control character without a short escape */
+            PRINT_ESCAPED_CP ((unsigned char)c);
+         }
+         else if (length - i > 2)
+         {
+            unsigned char c1 = (unsigned char)c;
+            unsigned char c2 = (unsigned char)str [i + 1];
+            unsigned char c3 = (unsigned char)str [i + 2];
+
+            /* U+2028 / U+2029 (E2 80 A8|A9), U+FFFE / U+FFFF (EF BF BE|BF)
+             * and WTF-8 encoded surrogates U+D800..U+DFFF (ED A0..BF xx) */
+            if ((c1 == 0xE2 && c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9))
+                || (c1 == 0xEF && c2 == 0xBF && (c3 == 0xBE || c3 == 0xBF))
+                || (c1 == 0xED && c2 >= 0xA0 && c2 <= 0xBF
+                    && c3 >= 0x80 && c3 <= 0xBF))
+            {
+               /* Decode the three-byte sequence and emit it as one escape */
+               unsigned int cp = ((c1 & 0x0Fu) << 12)
+                               | ((c2 & 0x3Fu) << 6)
+                               | (c3 & 0x3Fu);
+               PRINT_ESCAPED_CP (cp);
+               i += 2;
+            }
+            else
+            {
+               *buf ++ = c;
+            }
+         }
+         else
+         {
+            *buf ++ = c;
+         }
          break;
       };
    };
diff --git a/test/main.cc b/test/main.cc
index cd25c97..22b7113 100644
--- a/test/main.cc
+++ b/test/main.cc
@@ -58,6 +58,7 @@ int main (int argc, char * argv [])
    test_file ("valid-0010.json", &num_failed);
    test_file ("valid-0011.json", &num_failed);
    test_file ("valid-0012.json", &num_failed);
+   test_file ("valid-0013.json", &num_failed);
 
    printf ("Total failed tests: %d\n", num_failed);
 
diff --git a/test/valid-0013.json b/test/valid-0013.json
new file mode 100644
index 0000000..dbcefa2
--- /dev/null
+++ b/test/valid-0013.json
@@ -0,0 +1,14 @@
+{
+   "valid surrogate pair (😀 U+1F600)": "\uD83D\uDE00",
+   "lone high surrogate": "\uD800",
+   "lone low surrogate": "\uDC00",
+   "high surrogate not followed by low surrogate": "\uD834\u0061",
+   "low surrogate not preceded by high surrogate": "\u0061\uDD1E",
+   "reversed surrogate order (low then high)": "\uDC00\uD800",
+   "two high surrogates in a row": "\uD800\uD801",
+   "two low surrogates in a row": "\uDC00\uDC01",
+   "surrogate pair split by space": "\uD83D\u0020\uDE00",
+   "surrogate halves separated by text": "\uD83Dtest\uDE00",
+   "high surrogate followed by another escape": "\uD83D\u000A",
+   "high surrogate at end of string": "ABC\uD800"
+}