From 8b77aca78629d36e9df96a0a7a94e464646945ae Mon Sep 17 00:00:00 2001 From: reuk Date: Tue, 18 Nov 2025 15:19:42 +0000 Subject: [PATCH] CharPointer_UTF16: Make behaviour consistent when iterating through unpaired surrogates There's a few things going on in this commit: - The implementation of CharPointer_UTF16 now uses helpers from CharacterFunctions to avoid a few instances of magic numbers. - Where it makes sense, member functions of that class have been DRYed by e.g. implementing getAndAdvance in terms of operator*() and operator++(). - Added more tests for incrementing/decrementing/dereferencing CharPointer_UTF16. After this change, a CharPointer_UTF16 that points to an unpaired surrogate will always dereference to a 32-bit character with that surrogate's value. Note that dereferencing a CharPointer_UTF16 that points to a high surrogate at the final code unit in a memory region is inherently unsafe, because CharPointer_UTF16 can't track its own size, and the dereference operation will check the following code unit to see whether it is a low surrogate. --- .../juce_core/text/juce_CharPointer_UTF16.h | 38 ++++---- .../text/juce_CharPointer_UTF16_test.cpp | 97 +++++++++++++++++++ 2 files changed, 114 insertions(+), 21 deletions(-) diff --git a/modules/juce_core/text/juce_CharPointer_UTF16.h b/modules/juce_core/text/juce_CharPointer_UTF16.h index dd5c49d39f..457fa72598 100644 --- a/modules/juce_core/text/juce_CharPointer_UTF16.h +++ b/modules/juce_core/text/juce_CharPointer_UTF16.h @@ -90,33 +90,32 @@ public: /** Returns the unicode character that this pointer is pointing to. */ juce_wchar operator*() const noexcept { - auto n = (uint32) (uint16) *data; + const auto first = (uint32) (uint16) data[0]; - if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) data[1]) >= 0xdc00) - n = 0x10000 + (((n - 0xd800) << 10) | (((uint32) (uint16) data[1]) - 0xdc00)); + if ((++CharPointer_UTF16 (*this)).data - data == 1) + return (juce_wchar) first; - return (juce_wchar) n; + const auto second = (uint32) (uint16) data[1]; + return (juce_wchar) (0x10000 + (((first - 0xd800) << 10) | (second - 0xdc00))); } /** Moves this pointer along to the next character in the string. */ CharPointer_UTF16& operator++() noexcept { - auto n = (uint32) (uint16) *data++; - - if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) *data) >= 0xdc00) - ++data; - + data += (CharacterFunctions::isHighSurrogate ((uint16) data[0]) + && CharacterFunctions::isLowSurrogate ((uint16) data[1])) + ? 2 + : 1; return *this; } /** Moves this pointer back to the previous character in the string. */ CharPointer_UTF16& operator--() noexcept { - auto n = (uint32) (uint16) (*--data); - - if (n >= 0xdc00 && n <= 0xdfff) - --data; - + data -= (CharacterFunctions::isLowSurrogate ((uint16) data[-1]) + && CharacterFunctions::isHighSurrogate ((uint16) data[-2])) + ? 2 + : 1; return *this; } @@ -124,12 +123,9 @@ public: advances the pointer to point to the next character. */ juce_wchar getAndAdvance() noexcept { - auto n = (uint32) (uint16) *data++; - - if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) *data) >= 0xdc00) - n = 0x10000 + ((((n - 0xd800) << 10) | (((uint32) (uint16) *data++) - 0xdc00))); - - return (juce_wchar) n; + const auto result = **this; + ++(*this); + return result; } /** Moves this pointer along to the next character in the string. */ @@ -214,7 +210,7 @@ public: { auto n = (uint32) (uint16) *d++; - if (n >= 0xd800 && n <= 0xdfff) + if (CharacterFunctions::isHighSurrogate ((juce_wchar) n)) { if (*d++ == 0) break; diff --git a/modules/juce_core/text/juce_CharPointer_UTF16_test.cpp b/modules/juce_core/text/juce_CharPointer_UTF16_test.cpp index defbfd8733..a1f6f5f4d7 100644 --- a/modules/juce_core/text/juce_CharPointer_UTF16_test.cpp +++ b/modules/juce_core/text/juce_CharPointer_UTF16_test.cpp @@ -110,6 +110,103 @@ public: expect (CharPointer_UTF16::isValidString (string.data(), 4) == CharPointer_UTF32::canRepresent ((juce_wchar) c)); } } + + beginTest ("Iterating string starting with unpaired high surrogate produces a wide character with the surrogate value"); + { + const std::vector stringA { 0xd800, 0xa, 0xb }; + expect (rangesEqual (Span (stringA), Span (stringA))); + + const std::vector stringB { 0xd800, 0xe000, 0xb }; + expect (rangesEqual (Span (stringB), Span (stringB))); + } + + beginTest ("Iterating string ending with unpaired high surrogate produces a wide character with the surrogate value"); + { + const std::vector string { 0xa, 0xb, 0xd800, 0x0 }; + expect (rangesEqual (Span (string), Span (string))); + } + + beginTest ("Iterating string starting with unpaired low surrogate produces a wide character with the surrogate value"); + { + const std::vector stringA { 0xdc00, 0xa, 0xb }; + expect (rangesEqual (Span (stringA), Span (stringA))); + + const std::vector stringB { 0xdc00, 0xe000, 0xb }; + expect (rangesEqual (Span (stringB), Span (stringB))); + } + + beginTest ("Iterating string ending with unpaired low surrogate produces a wide character with the surrogate value"); + { + const std::vector string { 0xa, 0xb, 0xdc00 }; + expect (rangesEqual (Span (string), Span (string))); + } + + beginTest ("Iterating string with multiple unpaired surrogates produces produces wide characters with those surrogate values"); + { + const std::vector string { 0xd800, 0xd800, 0xdc00, 0xdc00, 0xa, 0xb }; + const std::vector expected { 0xd800, 0x10000, 0xdc00, 0xa, 0xb }; + expect (rangesEqual (Span (expected), Span (string))); + } + + beginTest ("Can decrement to unpaired low surrogate"); + { + const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xdc00, 0xb}; + CharPointer_UTF16 ptr { chars + 2 }; + + expect (*ptr == 0xb); + --ptr; + expect (ptr == CharPointer_UTF16 { chars + 1 }); + expect (*ptr == 0xdc00); + } + + beginTest ("Can decrement to unpaired high surrogate"); + { + const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xd800, 0xb }; + CharPointer_UTF16 ptr { chars + 2 }; + + expect (*ptr == 0xb); + --ptr; + expect (ptr == CharPointer_UTF16 { chars + 1 }); + expect (*ptr == 0xd800); + } + + beginTest ("Can decrement through surrogate pair"); + { + const CharPointer_UTF16::CharType chars[] { 0xa, + (CharPointer_UTF16::CharType) 0xd800, + (CharPointer_UTF16::CharType) 0xdc00, + 0xb }; + CharPointer_UTF16 ptr { chars + 3 }; + + expect (*ptr == 0xb); + + --ptr; + expect (ptr == CharPointer_UTF16 { chars + 1 }); + expect (*ptr == 0x10000); + + --ptr; + expect (ptr == CharPointer_UTF16 { chars }); + expect (*ptr == (CharPointer_UTF16::CharType) 0xa); + } + } + +private: + template + static bool rangesEqual (Span expected, Span units) + { + const auto dataPtr = reinterpret_cast (units.data()); + std::vector converted; + + for (const auto it : makeRange (CharPointer_UTF16 { dataPtr }, + CharPointer_UTF16 { dataPtr + units.size() })) + { + converted.push_back (it); + } + + // Some stdlibs require the arguments to std::equal to have the full complement of iterator + // member type aliases, so compare using std::vector iterators instead of CharPointer_UTF16 + // directly. + return std::equal (expected.begin(), expected.end(), converted.begin(), converted.end()); } };