mirror of
https://github.com/juce-framework/JUCE.git
synced 2026-01-08 23:24:19 +00:00
CharPointer_UTF16: Make behaviour consistent when iterating through unpaired surrogates
There are a few things going on in this commit:
- The implementation of CharPointer_UTF16 now uses helpers from CharacterFunctions to avoid a few instances of magic numbers.
- Where it makes sense, member functions of that class have been DRYed by e.g. implementing getAndAdvance in terms of operator*() and operator++().
- Added more tests for incrementing/decrementing/dereferencing CharPointer_UTF16.

After this change, a CharPointer_UTF16 that points to an unpaired surrogate will always dereference to a 32-bit character with that surrogate's value.

Note that dereferencing a CharPointer_UTF16 that points to a high surrogate at the final code unit in a memory region is inherently unsafe, because CharPointer_UTF16 can't track its own size, and the dereference operation will check the following code unit to see whether it is a low surrogate.
This commit is contained in:
parent
c514c95797
commit
8b77aca786
2 changed files with 114 additions and 21 deletions
|
|
@@ -90,33 +90,32 @@ public:
|
|||
/** Returns the unicode character that this pointer is pointing to. */
|
||||
juce_wchar operator*() const noexcept
|
||||
{
|
||||
auto n = (uint32) (uint16) *data;
|
||||
const auto first = (uint32) (uint16) data[0];
|
||||
|
||||
if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) data[1]) >= 0xdc00)
|
||||
n = 0x10000 + (((n - 0xd800) << 10) | (((uint32) (uint16) data[1]) - 0xdc00));
|
||||
if ((++CharPointer_UTF16 (*this)).data - data == 1)
|
||||
return (juce_wchar) first;
|
||||
|
||||
return (juce_wchar) n;
|
||||
const auto second = (uint32) (uint16) data[1];
|
||||
return (juce_wchar) (0x10000 + (((first - 0xd800) << 10) | (second - 0xdc00)));
|
||||
}
|
||||
|
||||
/** Moves this pointer along to the next character in the string. */
|
||||
CharPointer_UTF16& operator++() noexcept
|
||||
{
|
||||
auto n = (uint32) (uint16) *data++;
|
||||
|
||||
if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) *data) >= 0xdc00)
|
||||
++data;
|
||||
|
||||
data += (CharacterFunctions::isHighSurrogate ((uint16) data[0])
|
||||
&& CharacterFunctions::isLowSurrogate ((uint16) data[1]))
|
||||
? 2
|
||||
: 1;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** Moves this pointer back to the previous character in the string. */
|
||||
CharPointer_UTF16& operator--() noexcept
|
||||
{
|
||||
auto n = (uint32) (uint16) (*--data);
|
||||
|
||||
if (n >= 0xdc00 && n <= 0xdfff)
|
||||
--data;
|
||||
|
||||
data -= (CharacterFunctions::isLowSurrogate ((uint16) data[-1])
|
||||
&& CharacterFunctions::isHighSurrogate ((uint16) data[-2]))
|
||||
? 2
|
||||
: 1;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
@@ -124,12 +123,9 @@ public:
|
|||
advances the pointer to point to the next character. */
|
||||
juce_wchar getAndAdvance() noexcept
|
||||
{
|
||||
auto n = (uint32) (uint16) *data++;
|
||||
|
||||
if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) *data) >= 0xdc00)
|
||||
n = 0x10000 + ((((n - 0xd800) << 10) | (((uint32) (uint16) *data++) - 0xdc00)));
|
||||
|
||||
return (juce_wchar) n;
|
||||
const auto result = **this;
|
||||
++(*this);
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Moves this pointer along to the next character in the string. */
|
||||
|
|
@@ -214,7 +210,7 @@ public:
|
|||
{
|
||||
auto n = (uint32) (uint16) *d++;
|
||||
|
||||
if (n >= 0xd800 && n <= 0xdfff)
|
||||
if (CharacterFunctions::isHighSurrogate ((juce_wchar) n))
|
||||
{
|
||||
if (*d++ == 0)
|
||||
break;
|
||||
|
|
|
|||
|
|
@@ -110,6 +110,103 @@ public:
|
|||
expect (CharPointer_UTF16::isValidString (string.data(), 4) == CharPointer_UTF32::canRepresent ((juce_wchar) c));
|
||||
}
|
||||
}
|
||||
|
||||
beginTest ("Iterating string starting with unpaired high surrogate produces a wide character with the surrogate value");
|
||||
{
|
||||
const std::vector<char16_t> stringA { 0xd800, 0xa, 0xb };
|
||||
expect (rangesEqual (Span (stringA), Span (stringA)));
|
||||
|
||||
const std::vector<char16_t> stringB { 0xd800, 0xe000, 0xb };
|
||||
expect (rangesEqual (Span (stringB), Span (stringB)));
|
||||
}
|
||||
|
||||
beginTest ("Iterating string ending with unpaired high surrogate produces a wide character with the surrogate value");
|
||||
{
|
||||
const std::vector<char16_t> string { 0xa, 0xb, 0xd800, 0x0 };
|
||||
expect (rangesEqual (Span (string), Span (string)));
|
||||
}
|
||||
|
||||
beginTest ("Iterating string starting with unpaired low surrogate produces a wide character with the surrogate value");
|
||||
{
|
||||
const std::vector<char16_t> stringA { 0xdc00, 0xa, 0xb };
|
||||
expect (rangesEqual (Span (stringA), Span (stringA)));
|
||||
|
||||
const std::vector<char16_t> stringB { 0xdc00, 0xe000, 0xb };
|
||||
expect (rangesEqual (Span (stringB), Span (stringB)));
|
||||
}
|
||||
|
||||
beginTest ("Iterating string ending with unpaired low surrogate produces a wide character with the surrogate value");
|
||||
{
|
||||
const std::vector<char16_t> string { 0xa, 0xb, 0xdc00 };
|
||||
expect (rangesEqual (Span (string), Span (string)));
|
||||
}
|
||||
|
||||
beginTest ("Iterating string with multiple unpaired surrogates produces produces wide characters with those surrogate values");
|
||||
{
|
||||
const std::vector<char16_t> string { 0xd800, 0xd800, 0xdc00, 0xdc00, 0xa, 0xb };
|
||||
const std::vector<juce_wchar> expected { 0xd800, 0x10000, 0xdc00, 0xa, 0xb };
|
||||
expect (rangesEqual (Span (expected), Span (string)));
|
||||
}
|
||||
|
||||
beginTest ("Can decrement to unpaired low surrogate");
|
||||
{
|
||||
const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xdc00, 0xb};
|
||||
CharPointer_UTF16 ptr { chars + 2 };
|
||||
|
||||
expect (*ptr == 0xb);
|
||||
--ptr;
|
||||
expect (ptr == CharPointer_UTF16 { chars + 1 });
|
||||
expect (*ptr == 0xdc00);
|
||||
}
|
||||
|
||||
beginTest ("Can decrement to unpaired high surrogate");
|
||||
{
|
||||
const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xd800, 0xb };
|
||||
CharPointer_UTF16 ptr { chars + 2 };
|
||||
|
||||
expect (*ptr == 0xb);
|
||||
--ptr;
|
||||
expect (ptr == CharPointer_UTF16 { chars + 1 });
|
||||
expect (*ptr == 0xd800);
|
||||
}
|
||||
|
||||
beginTest ("Can decrement through surrogate pair");
|
||||
{
|
||||
const CharPointer_UTF16::CharType chars[] { 0xa,
|
||||
(CharPointer_UTF16::CharType) 0xd800,
|
||||
(CharPointer_UTF16::CharType) 0xdc00,
|
||||
0xb };
|
||||
CharPointer_UTF16 ptr { chars + 3 };
|
||||
|
||||
expect (*ptr == 0xb);
|
||||
|
||||
--ptr;
|
||||
expect (ptr == CharPointer_UTF16 { chars + 1 });
|
||||
expect (*ptr == 0x10000);
|
||||
|
||||
--ptr;
|
||||
expect (ptr == CharPointer_UTF16 { chars });
|
||||
expect (*ptr == (CharPointer_UTF16::CharType) 0xa);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/** Helper for the tests above: iterates a buffer of UTF-16 code units with
    CharPointer_UTF16 and checks the values produced against an expected sequence.

    @param expected  the sequence of values the iteration is expected to produce
    @param units     the UTF-16 code units to iterate over
    @returns         the result of std::equal over both full ranges, so sequences
                     of different lengths compare as not equal
*/
template <typename Expected>
|
||||
static bool rangesEqual (Span<const Expected> expected, Span<const char16_t> units)
|
||||
{
|
||||
// View the char16_t buffer through the code-unit type that CharPointer_UTF16 is constructed from.
const auto dataPtr = reinterpret_cast<const CharPointer_UTF16::CharType*> (units.data());
|
||||
std::vector<juce_wchar> converted;
|
||||
|
||||
// Iterate from the first code unit up to one past the last, collecting every
// value the range yields as a juce_wchar.
for (const auto it : makeRange (CharPointer_UTF16 { dataPtr },
|
||||
CharPointer_UTF16 { dataPtr + units.size() }))
|
||||
{
|
||||
converted.push_back (it);
|
||||
}
|
||||
|
||||
// Some stdlibs require the arguments to std::equal to have the full complement of iterator
|
||||
// member type aliases, so compare using std::vector iterators instead of CharPointer_UTF16
|
||||
// directly.
|
||||
return std::equal (expected.begin(), expected.end(), converted.begin(), converted.end());
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue