From f0560cefbb101f4d2aea159de74a93e7db5722e6 Mon Sep 17 00:00:00 2001 From: reuk Date: Wed, 19 Nov 2025 17:54:20 +0000 Subject: [PATCH] String: Fix mangled decoding of UTF-16 strings containing surrogate pairs in createStringFromData() --- modules/juce_core/text/juce_String.cpp | 68 ++++++++++++++++++++------ 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/modules/juce_core/text/juce_String.cpp b/modules/juce_core/text/juce_String.cpp index 3871e30d30..029525a771 100644 --- a/modules/juce_core/text/juce_String.cpp +++ b/modules/juce_core/text/juce_String.cpp @@ -1994,24 +1994,25 @@ String String::createStringFromData (const void* const unknownData, int size) if (size == 1) return charToString ((juce_wchar) data[0]); - if (CharPointer_UTF16::isByteOrderMarkBigEndian (data) - || CharPointer_UTF16::isByteOrderMarkLittleEndian (data)) + const auto bigEndianData = CharPointer_UTF16::isByteOrderMarkBigEndian (data); + + if (bigEndianData || CharPointer_UTF16::isByteOrderMarkLittleEndian (data)) { - const int numChars = size / 2 - 1; + const auto numUnits = size / 2 - 1; + const auto src = unalignedPointerCast (data + 2); + const auto swapBytes = bigEndianData ? ByteOrder::swapIfLittleEndian + : ByteOrder::swapIfBigEndian; - StringCreationHelper builder ((size_t) numChars); + StringCreationHelper builder ((size_t) numUnits); - auto src = unalignedPointerCast (data + 2); - - if (CharPointer_UTF16::isByteOrderMarkBigEndian (data)) + for (int i = 0; i < numUnits;) { - for (int i = 0; i < numChars; ++i) - builder.write ((juce_wchar) ByteOrder::swapIfLittleEndian (src[i])); - } - else - { - for (int i = 0; i < numChars; ++i) - builder.write ((juce_wchar) ByteOrder::swapIfBigEndian (src[i])); + const uint16 wideBuffer[] { swapBytes (src[i]), + swapBytes ((i + 1 == numUnits) ? (uint16) 0 : src[i + 1]) }; + const CharPointer_UTF16 ptr { reinterpret_cast (wideBuffer) }; + + builder.write (*ptr); + i += (int) ((ptr + 1).getAddress() - ptr.getAddress()); } builder.write (0); @@ -3011,6 +3012,45 @@ public: for (auto c : str) expectEquals (c, parts[index++]); } + + const CharPointer_UTF8 expectedString { "glass \xc2\xbd full" }; + const CharPointer_UTF8 emojiExpectedString { "hello JUCE \xf0\x9f\xa7\x83" }; + + beginTest ("createStringFromData reads LE UTF-16"); + { + constexpr char buffer[] = "\xff\xfe\x67\x00\x6c\x00\x61\x00\x73\x00\x73\x00\x20\x00\xbd\x00\x20\x00\x66\x00\x75\x00\x6c\x00\x6c\x00"; + expect (expectedString == String::createStringFromData (buffer, sizeof (buffer))); + + constexpr char emojiBuffer[] = "\xff\xfe\x68\x00\x65\x00\x6c\x00\x6c\x00\x6f\x00\x20\x00\x4a\x00\x55\x00\x43\x00\x45\x00\x20\x00\x3e\xd8\xc3\xdd"; + const auto emojiActualString = String::createStringFromData (emojiBuffer, sizeof (emojiBuffer)); + expect (emojiExpectedString == emojiActualString); + } + + beginTest ("createStringFromData reads BE UTF-16"); + { + constexpr char buffer[] = "\xfe\xff\x00\x67\x00\x6c\x00\x61\x00\x73\x00\x73\x00\x20\x00\xbd\x00\x20\x00\x66\x00\x75\x00\x6c\x00\x6c"; + expect (expectedString == String::createStringFromData (buffer, sizeof (buffer))); + + constexpr char emojiBuffer[] = "\xfe\xff\x00\x68\x00\x65\x00\x6c\x00\x6c\x00\x6f\x00\x20\x00\x4a\x00\x55\x00\x43\x00\x45\x00\x20\xd8\x3e\xdd\xc3"; + const auto emojiActualString = String::createStringFromData (emojiBuffer, sizeof (emojiBuffer)); + expect (emojiExpectedString == emojiActualString); + } + + beginTest ("createStringFromData reads UTF-8"); + { + constexpr char buffer[] = "glass \xc2\xbd full"; + expect (expectedString == String::createStringFromData (buffer, sizeof (buffer))); + + constexpr char emojiBuffer[] = "hello JUCE \xf0\x9f\xa7\x83"; + const auto emojiActualString = String::createStringFromData (emojiBuffer, sizeof (emojiBuffer)); + expect (emojiExpectedString == emojiActualString); + } + + beginTest ("createStringFromData reads Windows 1252"); + { + constexpr char buffer[] = "glass \xBD full"; + expect (expectedString == String::createStringFromData (buffer, sizeof (buffer))); + } } };