1
0
Fork 0
mirror of https://github.com/juce-framework/JUCE.git synced 2026-01-08 23:24:19 +00:00

CharPointer_UTF16: Make behaviour consistent when iterating through unpaired surrogates

There are a few things going on in this commit:

- The implementation of CharPointer_UTF16 now uses helpers from
  CharacterFunctions to avoid a few instances of magic numbers.
- Where it makes sense, member functions of that class have been DRYed
  by e.g. implementing getAndAdvance in terms of operator*() and
  operator++().
- Added more tests for incrementing/decrementing/dereferencing
  CharPointer_UTF16.

After this change, a CharPointer_UTF16 that points to an unpaired
surrogate will always dereference to a 32-bit character with that
surrogate's value.

Note that dereferencing a CharPointer_UTF16 that points to a high
surrogate at the final code unit in a memory region is inherently
unsafe, because CharPointer_UTF16 can't track its own size, and the
dereference operation will check the following code unit to see whether
it is a low surrogate.
This commit is contained in:
reuk 2025-11-18 15:19:42 +00:00
parent c514c95797
commit 8b77aca786
No known key found for this signature in database
2 changed files with 114 additions and 21 deletions

View file

@ -90,33 +90,32 @@ public:
/** Returns the unicode character that this pointer is pointing to.

    A high surrogate that is immediately followed by a low surrogate is combined with
    it into a single code point. Any other code unit, including an unpaired high or
    low surrogate, is returned unchanged as a character holding that unit's value.

    Note that when the current code unit is a high surrogate, the code unit after it
    is inspected too, so that unit must be readable.
*/
juce_wchar operator*() const noexcept
{
    const auto first = (uint32) (uint16) data[0];

    // Incrementing a copy of this pointer tells us how many code units the current
    // character occupies. If it only moves by one unit, this isn't the start of a
    // surrogate pair, and the unit stands for itself.
    if ((++CharPointer_UTF16 (*this)).data - data == 1)
        return (juce_wchar) first;

    // Otherwise, combine the high and low surrogates into a supplementary code point.
    const auto second = (uint32) (uint16) data[1];
    return (juce_wchar) (0x10000 + (((first - 0xd800) << 10) | (second - 0xdc00)));
}
/** Moves this pointer along to the next character in the string.

    The pointer steps over two code units when it points at a high surrogate that is
    immediately followed by a low surrogate, and over a single code unit in every
    other case. An unpaired surrogate is therefore treated as one character.
*/
CharPointer_UTF16& operator++() noexcept
{
    // The second code unit is only examined when the first is a high surrogate.
    data += (CharacterFunctions::isHighSurrogate ((uint16) data[0])
             && CharacterFunctions::isLowSurrogate ((uint16) data[1]))
          ? 2
          : 1;

    return *this;
}
/** Moves this pointer back to the previous character in the string.

    The pointer steps back over two code units when the unit before it is a low
    surrogate that is itself preceded by a high surrogate, and over a single code unit
    in every other case. An unpaired surrogate is therefore treated as one character.

    Note that when the preceding code unit is a low surrogate, the unit before that
    is inspected too, so that unit must be readable.
*/
CharPointer_UTF16& operator--() noexcept
{
    // data[-2] is only examined when data[-1] is a low surrogate.
    data -= (CharacterFunctions::isLowSurrogate ((uint16) data[-1])
             && CharacterFunctions::isHighSurrogate ((uint16) data[-2]))
          ? 2
          : 1;

    return *this;
}
@ -124,12 +123,9 @@ public:
advances the pointer to point to the next character. */
juce_wchar getAndAdvance() noexcept
{
    // Implemented in terms of the dereference and pre-increment operators, so that
    // the rules for surrogate pairs and unpaired surrogates live in one place.
    const auto result = **this;
    ++(*this);
    return result;
}
/** Moves this pointer along to the next character in the string. */
@ -214,7 +210,7 @@ public:
{
auto n = (uint32) (uint16) *d++;
if (n >= 0xd800 && n <= 0xdfff)
if (CharacterFunctions::isHighSurrogate ((juce_wchar) n))
{
if (*d++ == 0)
break;

View file

@ -110,6 +110,103 @@ public:
expect (CharPointer_UTF16::isValidString (string.data(), 4) == CharPointer_UTF32::canRepresent ((juce_wchar) c));
}
}
beginTest ("Iterating string starting with unpaired high surrogate produces a wide character with the surrogate value");
{
const std::vector<char16_t> stringA { 0xd800, 0xa, 0xb };
expect (rangesEqual (Span (stringA), Span (stringA)));
const std::vector<char16_t> stringB { 0xd800, 0xe000, 0xb };
expect (rangesEqual (Span (stringB), Span (stringB)));
}
beginTest ("Iterating string ending with unpaired high surrogate produces a wide character with the surrogate value");
{
const std::vector<char16_t> string { 0xa, 0xb, 0xd800, 0x0 };
expect (rangesEqual (Span (string), Span (string)));
}
beginTest ("Iterating string starting with unpaired low surrogate produces a wide character with the surrogate value");
{
const std::vector<char16_t> stringA { 0xdc00, 0xa, 0xb };
expect (rangesEqual (Span (stringA), Span (stringA)));
const std::vector<char16_t> stringB { 0xdc00, 0xe000, 0xb };
expect (rangesEqual (Span (stringB), Span (stringB)));
}
beginTest ("Iterating string ending with unpaired low surrogate produces a wide character with the surrogate value");
{
const std::vector<char16_t> string { 0xa, 0xb, 0xdc00 };
expect (rangesEqual (Span (string), Span (string)));
}
beginTest ("Iterating string with multiple unpaired surrogates produces wide characters with those surrogate values");
{
    // The first high surrogate is followed by another high surrogate, so it is unpaired.
    // The second high surrogate pairs with the first low surrogate, giving 0x10000.
    // The second low surrogate is left without a partner, so it is unpaired too.
    const std::vector<char16_t> string { 0xd800, 0xd800, 0xdc00, 0xdc00, 0xa, 0xb };
    const std::vector<juce_wchar> expected { 0xd800, 0x10000, 0xdc00, 0xa, 0xb };
    expect (rangesEqual (Span (expected), Span (string)));
}
beginTest ("Can decrement to unpaired low surrogate");
{
const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xdc00, 0xb};
CharPointer_UTF16 ptr { chars + 2 };
expect (*ptr == 0xb);
--ptr;
expect (ptr == CharPointer_UTF16 { chars + 1 });
expect (*ptr == 0xdc00);
}
beginTest ("Can decrement to unpaired high surrogate");
{
const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xd800, 0xb };
CharPointer_UTF16 ptr { chars + 2 };
expect (*ptr == 0xb);
--ptr;
expect (ptr == CharPointer_UTF16 { chars + 1 });
expect (*ptr == 0xd800);
}
beginTest ("Can decrement through surrogate pair");
{
const CharPointer_UTF16::CharType chars[] { 0xa,
(CharPointer_UTF16::CharType) 0xd800,
(CharPointer_UTF16::CharType) 0xdc00,
0xb };
CharPointer_UTF16 ptr { chars + 3 };
expect (*ptr == 0xb);
--ptr;
expect (ptr == CharPointer_UTF16 { chars + 1 });
expect (*ptr == 0x10000);
--ptr;
expect (ptr == CharPointer_UTF16 { chars });
expect (*ptr == (CharPointer_UTF16::CharType) 0xa);
}
}
private:
/** Decodes the given UTF-16 code units by iterating over them with CharPointer_UTF16,
    and returns true if the decoded characters match the expected sequence exactly
    (same length, and equal element by element).
*/
template <typename Expected>
static bool rangesEqual (Span<const Expected> expected, Span<const char16_t> units)
{
    const auto firstUnit = reinterpret_cast<const CharPointer_UTF16::CharType*> (units.data());
    const auto onePastLastUnit = firstUnit + units.size();

    std::vector<juce_wchar> decoded;

    for (const auto character : makeRange (CharPointer_UTF16 { firstUnit },
                                           CharPointer_UTF16 { onePastLastUnit }))
        decoded.push_back (character);

    // The decoded characters are collected into a std::vector first, because some
    // stdlibs only accept iterators that declare the full set of iterator member type
    // aliases as arguments to std::equal, so CharPointer_UTF16 isn't passed directly.
    return std::equal (expected.begin(), expected.end(), decoded.begin(), decoded.end());
}
};