From 0d95c02a0c2747cccb9ddd2df336f792610a3a45 Mon Sep 17 00:00:00 2001 From: Slurmlord Date: Fri, 19 Dec 2025 11:15:39 +0100 Subject: [PATCH 1/4] Add base UTF8 functions to WWLib --- .../GameSpy/Thread/ThreadUtils.cpp | 17 +- .../Source/WWVegas/WWLib/CMakeLists.txt | 2 + Core/Libraries/Source/WWVegas/WWLib/utf8.cpp | 291 ++++++++++++++++++ Core/Libraries/Source/WWVegas/WWLib/utf8.h | 47 +++ 4 files changed, 348 insertions(+), 9 deletions(-) create mode 100644 Core/Libraries/Source/WWVegas/WWLib/utf8.cpp create mode 100644 Core/Libraries/Source/WWVegas/WWLib/utf8.h diff --git a/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp b/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp index fedf8d86988..5b0b0ee26c8 100644 --- a/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp +++ b/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp @@ -27,15 +27,16 @@ // Author: Matthew D. Campbell, July 2002 #include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine +#include "utf8.h" //------------------------------------------------------------------------- std::wstring MultiByteToWideCharSingleLine( const char *orig ) { - Int len = strlen(orig); - WideChar *dest = NEW WideChar[len+1]; + const size_t size = get_size_as_widechar( orig ); + WideChar *dest = NEW WideChar[size / sizeof(WideChar)]; - MultiByteToWideChar(CP_UTF8, 0, orig, -1, dest, len); + convert_utf8_to_widechar( orig, dest, size ); WideChar *c = NULL; do { @@ -56,7 +57,6 @@ std::wstring MultiByteToWideCharSingleLine( const char *orig ) } while ( c != NULL ); - dest[len] = 0; std::wstring ret = dest; delete[] dest; return ret; @@ -65,12 +65,11 @@ std::wstring MultiByteToWideCharSingleLine( const char *orig ) std::string WideCharStringToMultiByte( const WideChar *orig ) { std::string ret; - Int len = WideCharToMultiByte( CP_UTF8, 0, orig, wcslen(orig), NULL, 0, NULL, NULL ) + 1; - if (len > 0) + const size_t size = get_size_as_utf8( orig ); + if (size > 0) { - char *dest = NEW char[len]; - WideCharToMultiByte( CP_UTF8, 0, orig, -1, dest, len, NULL, NULL ); - dest[len-1] = 0; + char *dest = NEW char[size]; + convert_widechar_to_utf8( orig, dest, size ); ret = dest; delete[] dest; } diff --git a/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt b/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt index 997a6c02420..70a7bbc2f49 100644 --- a/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt +++ b/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt @@ -133,6 +133,8 @@ set(WWLIB_SRC trim.cpp trim.h uarray.h + utf8.h + utf8.cpp vector.cpp Vector.h visualc.h diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp new file mode 100644 index 00000000000..28a9af5ff40 --- /dev/null +++ b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp @@ -0,0 +1,291 @@ +/* +** Command & Conquer Generals Zero Hour(tm) +** Copyright 2025 TheSuperHackers +** +** This program is free software: you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation, either version 3 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program. If not, see . +*/ + +#include "always.h" +#include "utf8.h" + + +// Returns true if c is a single-byte UTF-8 character (i.e., 0xxxxxxx). +static bool utf8_is_single_byte(char c) +{ + return (c & 0x80) == 0x0; +} + +// Returns true if c is a UTF-8 lead byte (i.e., 110xxxxx, 1110xxxx, or 11110xxx). +static bool utf8_is_lead_byte(char c) +{ + unsigned char uc = static_cast(c); + return uc >= 0xC2 && uc <= 0xF7; +} + +// Returns true if c is a UTF-8 trail byte (i.e., 10xxxxxx). +static bool utf8_is_trail_byte(char c) +{ + return (c & 0xC0) == 0x80; +} + +// Returns true if c is an invalid UTF-8 value (overlong encoding or 5/6 byte sequences). +static bool utf8_is_invalid_value(char c) +{ + unsigned char uc = static_cast(c); + return uc == 0xC0 || uc == 0xC1 || uc >= 0xF8; +} + +size_t utf8_num_bytes(char lead) +{ + if (utf8_is_single_byte(lead)) + { + return 1; + } + + if (utf8_is_lead_byte(lead)) + { + if ((lead & 0xE0) == 0xC0) + { + return 2; + } + else if ((lead & 0xF0) == 0xE0) + { + return 3; + } + else if ((lead & 0xF8) == 0xF0) + { + return 4; + } + } + + // Invalid lead byte + return 0; +} + +// Validates a 2-byte UTF-8 sequence given the lead byte and one trail byte. +static bool utf8_is_valid_lead_and_trail_2(char lead, char trail) +{ + return utf8_is_lead_byte(lead) && utf8_num_bytes(lead) == 2 && utf8_is_trail_byte(trail); +} + +// Validates a 3-byte UTF-8 sequence given the lead byte and two trail bytes. +static bool utf8_is_valid_lead_and_trail_3(char lead, char trail1, char trail2) +{ + return utf8_is_lead_byte(lead) && utf8_num_bytes(lead) == 3 && utf8_is_trail_byte(trail1) && utf8_is_trail_byte(trail2); +} + +// Validates a 4-byte UTF-8 sequence given the lead byte and three trail bytes. +static bool utf8_is_valid_lead_and_trail_4(char lead, char trail1, char trail2, char trail3) +{ + return utf8_is_lead_byte(lead) && utf8_num_bytes(lead) == 4 && utf8_is_trail_byte(trail1) && utf8_is_trail_byte(trail2) && utf8_is_trail_byte(trail3); +} + +bool utf8_validate_string(const char* str) +{ + if (!str) + { + WWDEBUG_WARNING(("utf8_validate_string: str is null")); + return false; + } + + size_t len = strlen_t(str); + return utf8_validate_string(str, len); +} + +bool utf8_validate_string(const char* str, const size_t length) +{ + if (!str) + { + WWDEBUG_WARNING(("utf8_validate_string: str is null")); + return false; + } + + size_t i = 0; + while (i < length) + { + const char c = str[i]; + + if (utf8_is_single_byte(c)) + { + i++; + } + else if (utf8_is_invalid_value(c)) + { + // Invalid byte value + WWDEBUG_WARNING(("utf8_validate_string: failed, offset=%u, invalid byte value", i)); + return false; + } + else if (utf8_is_lead_byte(c)) + { + const size_t num_bytes = utf8_num_bytes(c); + + if (num_bytes == 0 || i + num_bytes > length) + { + WWDEBUG_WARNING(("utf8_validate_string: failed, offset=%u, num_bytes=%u", i, num_bytes)); + return false; + } + + if (num_bytes == 2) + { + if (!utf8_is_valid_lead_and_trail_2(c, str[i + 1])) + { + WWDEBUG_WARNING(("utf8_validate_string: failed, offset=%u, invalid lead and trail 2", i)); + return false; + } + } + else if (num_bytes == 3) + { + if (!utf8_is_valid_lead_and_trail_3(c, str[i + 1], str[i + 2])) + { + WWDEBUG_WARNING(("utf8_validate_string: failed, offset=%u, invalid lead and trail 3", i)); + return false; + } + } + else if (num_bytes == 4) + { + if (!utf8_is_valid_lead_and_trail_4(c, str[i + 1], str[i + 2], str[i + 3])) + { + WWDEBUG_WARNING(("utf8_validate_string: failed, offset=%u, invalid lead and trail 4", i)); + return false; + } + } + + i += num_bytes; + } + else + { + // Invalid byte (trail byte without lead) + WWDEBUG_WARNING(("utf8_validate_string: failed, offset=%u, trail without lead", i)); + return false; + } + } + + return true; +} + +int utf8_truncate_if_incomplete(const char* str, size_t length) +{ + if (length == 0) + { + return 0; + } + + char c = str[length - 1]; + if (utf8_is_single_byte(c)) + { + // No truncation needed + return 0; + } + else if (utf8_is_lead_byte(c)) + { + // Incomplete sequence; Single lead byte at the end + return 1; + } + else if (utf8_is_trail_byte(c) && (length > 1)) + { + char c2 = str[length - 2]; + if (utf8_is_lead_byte(c2)) + { + if (utf8_is_valid_lead_and_trail_2(c2, c)) + { + return 0; + } + + // Incomplete sequence + return 2; + } + else if (utf8_is_trail_byte(c2) && (length > 2)) + { + char c3 = str[length - 3]; + if (utf8_is_lead_byte(c3)) + { + if (utf8_is_valid_lead_and_trail_3(c3, c2, c)) + { + return 0; + } + + // Incomplete sequence + return 3; + } + else if (utf8_is_trail_byte(c3) && (length > 3)) + { + char c4 = str[length - 4]; + if (utf8_is_lead_byte(c4)) + { + if (utf8_is_valid_lead_and_trail_4(c4, c3, c2, c)) + { + return 0; + } + + // Invalid sequence + return 4; + } + } + } + } + + // The sequence is incomplete - for example of length 2 and both are trailing bytes, + // so we can't determine how many bytes to truncate. + return 0; +} + +#ifdef _WIN32 +size_t get_size_as_utf8(const wchar_t* s) +{ + int reqBytes = WideCharToMultiByte(CP_UTF8, 0, s, -1, nullptr, 0, nullptr, nullptr); + if (reqBytes == 0) + { + WWDEBUG_WARNING(("get_size_as_utf8: WideCharToMultiByte failed with error: %u", GetLastError())); + } + + return reqBytes; +} + +size_t get_size_as_widechar(const char* s) +{ + int reqChars = MultiByteToWideChar(CP_UTF8, 0, s, -1, nullptr, 0); + if (reqChars == 0) + { + WWDEBUG_WARNING(("get_size_as_widechar: MultiByteToWideChar failed with error: %u", GetLastError())); + } + + return reqChars * sizeof(wchar_t); +} + +size_t convert_widechar_to_utf8(const wchar_t* orig, char* tgt, size_t tgtsize) +{ + WWASSERT_PRINT(tgtsize >= get_size_as_utf8(orig), "Insufficient buffer for UTF8 conversion"); + int bytesWritten = WideCharToMultiByte(CP_UTF8, 0, orig, -1, tgt, tgtsize, NULL, NULL); + if (bytesWritten == 0) + { + WWDEBUG_WARNING(("convert_widechar_to_utf8: WideCharToMultiByte failed with error: %u", GetLastError())); + } + + return bytesWritten; +} + +size_t convert_utf8_to_widechar(const char* orig, wchar_t* tgt, size_t tgtsize) +{ + WWASSERT_PRINT(tgtsize >= get_size_as_widechar(orig), "Insufficient buffer for widechar conversion"); + int charsWritten = MultiByteToWideChar(CP_UTF8, 0, orig, -1, tgt, static_cast(tgtsize / sizeof(wchar_t))); + if (charsWritten == 0) + { + WWDEBUG_WARNING(("convert_utf8_to_widechar: MultiByteToWideChar failed with error: %u", GetLastError())); + } + + return charsWritten * sizeof(wchar_t); +} +#else +#error "UTF-8 conversion functions not implemented for this platform" +#endif diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.h b/Core/Libraries/Source/WWVegas/WWLib/utf8.h new file mode 100644 index 00000000000..85ae1b716f5 --- /dev/null +++ b/Core/Libraries/Source/WWVegas/WWLib/utf8.h @@ -0,0 +1,47 @@ +/* +** Command & Conquer Generals Zero Hour(tm) +** Copyright 2025 TheSuperHackers +** +** This program is free software: you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation, either version 3 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program. If not, see . +*/ + +#pragma once + +#include "always.h" + +// Single-character functions + +// Returns the number of bytes in a UTF-8 character based on the lead byte. Returns 0 if invalid lead byte. +size_t utf8_num_bytes(char lead); +// Returns number of bytes to truncate if str contains incomplete UTF-8 character at the end. +// 0 if no truncation needed. Assumes correct UTF-8 up to length - 1. +int utf8_truncate_if_incomplete(const char* str, size_t length); + +// Validation functions + +// Validates whether the given string is valid UTF-8. +bool utf8_validate_string(const char* str); +// Validates whether the given string is valid UTF-8 up to the specified length. +bool utf8_validate_string(const char* str, const size_t length); + +// Conversion functions + +// Gets the size in bytes required to hold the UTF-8 representation of the given widechar string, including null terminator. +size_t get_size_as_utf8(const wchar_t* s); +// Gets the size in bytes required to hold the widechar representation of the given UTF-8 string, including null terminator. +size_t get_size_as_widechar(const char* s); +// Converts a widechar string to UTF-8. Assumes tgt has enough space (use get_size_as_utf8 to determine size). +size_t convert_widechar_to_utf8(const wchar_t* orig, char* tgt, size_t tgtsize); +// Converts a UTF-8 string to widechar. Assumes tgt has enough space (use get_size_as_widechar to determine size). +size_t convert_utf8_to_widechar(const char* orig, wchar_t* tgt, size_t tgtsize); From 4c3a537a5c955f126b3a3bf7b1feb50a442fa471 Mon Sep 17 00:00:00 2001 From: Slurmlord Date: Thu, 1 Jan 2026 16:33:40 +0100 Subject: [PATCH 2/4] Add UTF8 functions to AsciiString and UnicodeString --- Core/GameEngine/Include/Common/AsciiString.h | 10 +++++++++ .../GameEngine/Include/Common/UnicodeString.h | 5 +++++ .../Source/Common/System/AsciiString.cpp | 22 +++++++++++++++++++ .../Source/Common/System/UnicodeString.cpp | 16 ++++++++++++++ 4 files changed, 53 insertions(+) diff --git a/Core/GameEngine/Include/Common/AsciiString.h b/Core/GameEngine/Include/Common/AsciiString.h index 5af9cbe130e..436cafb5f41 100644 --- a/Core/GameEngine/Include/Common/AsciiString.h +++ b/Core/GameEngine/Include/Common/AsciiString.h @@ -364,6 +364,16 @@ class AsciiString Bool isNotEmpty() const { return !isEmpty(); } Bool isNotNone() const { return !isNone(); } + /** + return true iff the string is valid UTF-8. + */ + Bool isValidUtf8() const; + + /** + convert the given UnicodeString to UTF-8 and store it in self. + */ + void convertToUtf8(const UnicodeString& unicodeStr); + // // You might think it would be a good idea to overload the * operator // to allow for an implicit conversion to an char*. This is diff --git a/Core/GameEngine/Include/Common/UnicodeString.h b/Core/GameEngine/Include/Common/UnicodeString.h index ff66edfe478..9e5155c48d7 100644 --- a/Core/GameEngine/Include/Common/UnicodeString.h +++ b/Core/GameEngine/Include/Common/UnicodeString.h @@ -340,6 +340,11 @@ class UnicodeString */ Bool nextToken(UnicodeString* token, UnicodeString delimiters = UnicodeString::TheEmptyString); + /** + convert the given UTF-8 encoded AsciiString to WideChar and store it in self. + */ + void convertFromUtf8(const AsciiString& asciiStr); + // // You might think it would be a good idea to overload the * operator // to allow for an implicit conversion to an WideChar*. This is diff --git a/Core/GameEngine/Source/Common/System/AsciiString.cpp b/Core/GameEngine/Source/Common/System/AsciiString.cpp index 31567ec5335..cf9e00a53bc 100644 --- a/Core/GameEngine/Source/Common/System/AsciiString.cpp +++ b/Core/GameEngine/Source/Common/System/AsciiString.cpp @@ -45,6 +45,7 @@ #include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine #include "Common/CriticalSection.h" +#include "utf8.h" // ----------------------------------------------------- @@ -534,3 +535,24 @@ Bool AsciiString::nextToken(AsciiString* tok, const char* seps) return false; } } + +//----------------------------------------------------------------------------- +Bool AsciiString::isValidUtf8() const +{ + return utf8_validate_string(str()); +} + +//----------------------------------------------------------------------------- +void AsciiString::convertToUtf8(const UnicodeString& unicodeStr) +{ + const size_t requiredBytes = get_size_as_utf8(unicodeStr.str()); + if (requiredBytes == 0) + { + DEBUG_CRASH(("AsciiString::convertToUtf8: failed to get size as UTF-8")); + return; + } + + ensureUniqueBufferOfSize(static_cast(requiredBytes), false, NULL, NULL); + convert_widechar_to_utf8(unicodeStr.str(), peek(), requiredBytes); + validate(); +} diff --git a/Core/GameEngine/Source/Common/System/UnicodeString.cpp b/Core/GameEngine/Source/Common/System/UnicodeString.cpp index af1a6317e3f..be6983babe6 100644 --- a/Core/GameEngine/Source/Common/System/UnicodeString.cpp +++ b/Core/GameEngine/Source/Common/System/UnicodeString.cpp @@ -45,6 +45,7 @@ #include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine #include "Common/CriticalSection.h" +#include "utf8.h" // ----------------------------------------------------- @@ -465,3 +466,18 @@ Bool UnicodeString::nextToken(UnicodeString* tok, UnicodeString delimiters) return false; } } + +//----------------------------------------------------------------------------- +void UnicodeString::convertFromUtf8(const AsciiString& asciiStr) +{ + const size_t requiredBytes = get_size_as_widechar(asciiStr.str()); + if (requiredBytes == 0) + { + DEBUG_CRASH(("UnicodeString::convertFromUtf8: failed to get size as WideChar")); + return; + } + + ensureUniqueBufferOfSize(static_cast(requiredBytes / sizeof(WideChar)), false, NULL, NULL); + convert_utf8_to_widechar(asciiStr.str(), peek(), requiredBytes); + validate(); +} From e7eb7d236a22caf5f3e53ba0f6bc49f76451b5f6 Mon Sep 17 00:00:00 2001 From: Slurmlord Date: Thu, 1 Jan 2026 16:38:19 +0100 Subject: [PATCH 3/4] Use new UTF8 functions in ParseAsciiStringToGameInfo --- Core/GameEngine/Source/GameNetwork/GameInfo.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Core/GameEngine/Source/GameNetwork/GameInfo.cpp b/Core/GameEngine/Source/GameNetwork/GameInfo.cpp index 6b3298d6ef3..7dba57c1572 100644 --- a/Core/GameEngine/Source/GameNetwork/GameInfo.cpp +++ b/Core/GameEngine/Source/GameNetwork/GameInfo.cpp @@ -44,6 +44,7 @@ #include "GameNetwork/LANAPI.h" // for testing packet size #include "GameNetwork/LANAPICallbacks.h" // for testing packet size #include "strtok_r.h" +#include "utf8.h" @@ -1032,6 +1033,11 @@ Bool ParseAsciiStringToGameInfo(GameInfo *game, AsciiString options) //DEBUG_LOG(("Saw options of %s", options.str())); DEBUG_LOG(("ParseAsciiStringToGameInfo - parsing [%s]", options.str())); + if (!options.isValidUtf8()) + { + DEBUG_LOG(("ParseAsciiStringToGameInfo - options string is not valid UTF-8")); + return false; + } while ( (keyValPair = strtok_r(bufPtr, ";", &strPos)) != NULL ) { @@ -1167,8 +1173,9 @@ Bool ParseAsciiStringToGameInfo(GameInfo *game, AsciiString options) DEBUG_LOG(("ParseAsciiStringToGameInfo - slotValue name is empty, quitting")); break; } + UnicodeString name; - name.set(MultiByteToWideCharSingleLine(slotValue.str() +1).c_str()); + name.convertFromUtf8(slotValue.str() + 1); //DEBUG_LOG(("ParseAsciiStringToGameInfo - name is %s", slotValue.str()+1)); From d83e5eea045946cedc647fbb753208192c05faff Mon Sep 17 00:00:00 2001 From: Slurmlord Date: Thu, 1 Jan 2026 17:03:10 +0100 Subject: [PATCH 4/4] Happy new year! --- Core/Libraries/Source/WWVegas/WWLib/utf8.cpp | 2 +- Core/Libraries/Source/WWVegas/WWLib/utf8.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp index 28a9af5ff40..d75e7a60995 100644 --- a/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp +++ b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp @@ -1,6 +1,6 @@ /* ** Command & Conquer Generals Zero Hour(tm) -** Copyright 2025 TheSuperHackers +** Copyright 2026 TheSuperHackers ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.h b/Core/Libraries/Source/WWVegas/WWLib/utf8.h index 85ae1b716f5..a37729e4f6c 100644 --- a/Core/Libraries/Source/WWVegas/WWLib/utf8.h +++ b/Core/Libraries/Source/WWVegas/WWLib/utf8.h @@ -1,6 +1,6 @@ /* ** Command & Conquer Generals Zero Hour(tm) -** Copyright 2025 TheSuperHackers +** Copyright 2026 TheSuperHackers ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by