CharPointer_UTF16: Make behaviour consistent when iterating through unpaired surrogates

There's a few things going on in this commit: - The implementation of CharPointer_UTF16 now uses helpers from CharacterFunctions to avoid a few instances of magic numbers. - Where it makes sense, member functions of that class have been DRYed by e.g. implementing getAndAdvance in terms of operator*() and operator++(). - Added more tests for incrementing/decrementing/dereferencing CharPointer_UTF16. After this change, a CharPointer_UTF16 that points to an unpaired surrogate will always dereference to a 32-bit character with that surrogate's value. Note that dereferencing a CharPointer_UTF16 that points to a high surrogate at the final code unit in a memory region is inherently unsafe, because CharPointer_UTF16 can't track its own size, and the dereference operation will check the following code unit to see whether it is a low surrogate.
2026-01-08 23:24:19 +00:00 · 2025-11-18 15:19:42 +00:00 · 2025-11-18 15:19:42 +00:00 · 8b77aca786
commit 8b77aca786
parent c514c95797
2 changed files with 114 additions and 21 deletions
--- a/modules/juce_core/text/juce_CharPointer_UTF16.h
+++ b/modules/juce_core/text/juce_CharPointer_UTF16.h
@ -90,33 +90,32 @@ public:
    /** Returns the unicode character that this pointer is pointing to. */
    juce_wchar operator*() const noexcept
    {
-        auto n = (uint32) (uint16) *data;
+        const auto first = (uint32) (uint16) data[0];

-        if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) data[1]) >= 0xdc00)
-            n = 0x10000 + (((n - 0xd800) << 10) | (((uint32) (uint16) data[1]) - 0xdc00));
+        if ((++CharPointer_UTF16 (*this)).data - data == 1)
+            return (juce_wchar) first;

-        return (juce_wchar) n;
+        const auto second = (uint32) (uint16) data[1];
+        return (juce_wchar) (0x10000 + (((first - 0xd800) << 10) | (second - 0xdc00)));
    }

    /** Moves this pointer along to the next character in the string. */
    CharPointer_UTF16& operator++() noexcept
    {
-        auto n = (uint32) (uint16) *data++;
-
-        if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) *data) >= 0xdc00)
-            ++data;
-
+        data += (CharacterFunctions::isHighSurrogate ((uint16) data[0])
+                 && CharacterFunctions::isLowSurrogate ((uint16) data[1]))
+              ? 2
+              : 1;
        return *this;
    }

    /** Moves this pointer back to the previous character in the string. */
    CharPointer_UTF16& operator--() noexcept
    {
-        auto n = (uint32) (uint16) (*--data);
-
-        if (n >= 0xdc00 && n <= 0xdfff)
-            --data;
-
+        data -= (CharacterFunctions::isLowSurrogate ((uint16) data[-1])
+                 && CharacterFunctions::isHighSurrogate ((uint16) data[-2]))
+              ? 2
+              : 1;
        return *this;
    }

@ -124,12 +123,9 @@ public:
        advances the pointer to point to the next character. */
    juce_wchar getAndAdvance() noexcept
    {
-        auto n = (uint32) (uint16) *data++;
-
-        if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) *data) >= 0xdc00)
-            n = 0x10000 + ((((n - 0xd800) << 10) | (((uint32) (uint16) *data++) - 0xdc00)));
-
-        return (juce_wchar) n;
+        const auto result = **this;
+        ++(*this);
+        return result;
    }

    /** Moves this pointer along to the next character in the string. */
@ -214,7 +210,7 @@ public:
        {
            auto n = (uint32) (uint16) *d++;

-            if (n >= 0xd800 && n <= 0xdfff)
+            if (CharacterFunctions::isHighSurrogate ((juce_wchar) n))
            {
                if (*d++ == 0)
                    break;
--- a/modules/juce_core/text/juce_CharPointer_UTF16_test.cpp
+++ b/modules/juce_core/text/juce_CharPointer_UTF16_test.cpp
@ -110,6 +110,103 @@ public:
                expect (CharPointer_UTF16::isValidString (string.data(), 4) == CharPointer_UTF32::canRepresent ((juce_wchar) c));
            }
        }
+
+        beginTest ("Iterating string starting with unpaired high surrogate produces a wide character with the surrogate value");
+        {
+            const std::vector<char16_t> stringA { 0xd800, 0xa, 0xb };
+            expect (rangesEqual (Span (stringA), Span (stringA)));
+
+            const std::vector<char16_t> stringB { 0xd800, 0xe000, 0xb };
+            expect (rangesEqual (Span (stringB), Span (stringB)));
+        }
+
+        beginTest ("Iterating string ending with unpaired high surrogate produces a wide character with the surrogate value");
+        {
+            const std::vector<char16_t> string { 0xa, 0xb, 0xd800, 0x0 };
+            expect (rangesEqual (Span (string), Span (string)));
+        }
+
+        beginTest ("Iterating string starting with unpaired low surrogate produces a wide character with the surrogate value");
+        {
+            const std::vector<char16_t> stringA { 0xdc00, 0xa, 0xb };
+            expect (rangesEqual (Span (stringA), Span (stringA)));
+
+            const std::vector<char16_t> stringB { 0xdc00, 0xe000, 0xb };
+            expect (rangesEqual (Span (stringB), Span (stringB)));
+        }
+
+        beginTest ("Iterating string ending with unpaired low surrogate produces a wide character with the surrogate value");
+        {
+            const std::vector<char16_t> string { 0xa, 0xb, 0xdc00 };
+            expect (rangesEqual (Span (string), Span (string)));
+        }
+
+        beginTest ("Iterating string with multiple unpaired surrogates produces produces wide characters with those surrogate values");
+        {
+            const std::vector<char16_t> string { 0xd800, 0xd800, 0xdc00, 0xdc00, 0xa, 0xb };
+            const std::vector<juce_wchar> expected { 0xd800, 0x10000, 0xdc00, 0xa, 0xb };
+            expect (rangesEqual (Span (expected), Span (string)));
+        }
+
+        beginTest ("Can decrement to unpaired low surrogate");
+        {
+            const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xdc00, 0xb};
+            CharPointer_UTF16 ptr { chars + 2 };
+
+            expect (*ptr == 0xb);
+            --ptr;
+            expect (ptr == CharPointer_UTF16 { chars + 1 });
+            expect (*ptr == 0xdc00);
+        }
+
+        beginTest ("Can decrement to unpaired high surrogate");
+        {
+            const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xd800, 0xb };
+            CharPointer_UTF16 ptr { chars + 2 };
+
+            expect (*ptr == 0xb);
+            --ptr;
+            expect (ptr == CharPointer_UTF16 { chars + 1 });
+            expect (*ptr == 0xd800);
+        }
+
+        beginTest ("Can decrement through surrogate pair");
+        {
+            const CharPointer_UTF16::CharType chars[] { 0xa,
+                                                        (CharPointer_UTF16::CharType) 0xd800,
+                                                        (CharPointer_UTF16::CharType) 0xdc00,
+                                                        0xb };
+            CharPointer_UTF16 ptr { chars + 3 };
+
+            expect (*ptr == 0xb);
+
+            --ptr;
+            expect (ptr == CharPointer_UTF16 { chars + 1 });
+            expect (*ptr == 0x10000);
+
+            --ptr;
+            expect (ptr == CharPointer_UTF16 { chars });
+            expect (*ptr == (CharPointer_UTF16::CharType) 0xa);
+        }
+    }
+
+private:
+    template <typename Expected>
+    static bool rangesEqual (Span<const Expected> expected, Span<const char16_t> units)
+    {
+        const auto dataPtr = reinterpret_cast<const CharPointer_UTF16::CharType*> (units.data());
+        std::vector<juce_wchar> converted;
+
+        for (const auto it : makeRange (CharPointer_UTF16 { dataPtr },
+                                        CharPointer_UTF16 { dataPtr + units.size() }))
+        {
+            converted.push_back (it);
+        }
+
+        // Some stdlibs require the arguments to std::equal to have the full complement of iterator
+        // member type aliases, so compare using std::vector iterators instead of CharPointer_UTF16
+        // directly.
+        return std::equal (expected.begin(), expected.end(), converted.begin(), converted.end());
    }
 };