Represent tab characters with non-breaking space during shaping

This avoids assertions raised when shaping text containing tabs.
2026-01-10 23:44:24 +00:00 · 2024-06-19 14:37:53 +02:00 · 2024-06-19 14:37:53 +02:00 · ef8417023e
commit ef8417023e
parent 0d8f2c63ec
2 changed files with 91 additions and 32 deletions
--- a/BREAKING_CHANGES.md
+++ b/BREAKING_CHANGES.md
@ -1,5 +1,35 @@
 # JUCE breaking changes

+# develop
+
+## Change
+
+The tab width when rendering text with the GlyphArrangement and TextLayout
+classes now equals the width of a space. Previously it equaled the width of a
+tofu character used for missing glyphs.
+
+**Possible Issues**
+
+User interfaces using the GlyphArrangement and TextLayout classes directly to
+render text containing tabs will look different. The TextEditor and
+CodeEditorComponent classes have special logic for replacing the tabs prior to
+rendering, and consequently, these are not affected.
+
+**Workaround**
+
+Replace the tab characters prior to rendering and substitute them with the
+required number of non-breaking spaces.
+
+**Rationale**
+
+Since the Unicode-related revamping of JUCE's text rendering classes, tab
+characters would raise assertions and would be rendered with the tofu glyph.
+This change visually treats tab characters as non-breaking spaces. Since the
+JUCE 7 behaviour of using the tofu glyph's width was not a conscious decision,
+but rather a side effect of ignoring unresolved glyphs, using a default width
+of one space is more reasonable.
+
+
 # Version 8.0.0

 ## Change
--- a/modules/juce_graphics/fonts/juce_SimpleShapedText.cpp
+++ b/modules/juce_graphics/fonts/juce_SimpleShapedText.cpp
@ -330,6 +330,49 @@ private:
    size_t beyondEnd{};
 };

+enum class ControlCharacter  // Kinds of control character recorded by findControlCharacters().
+{
+    crFollowedByLf,  // A CR whose next character is an LF.
+    cr,  // A CR that is not directly followed by an LF.
+    lf,
+    tab
+};
+
+static auto findControlCharacters (Span<juce_wchar> text)  // Returns a map from the index of each LF, CR and tab in text to its ControlCharacter kind.
+{
+    constexpr juce_wchar lf = 0x0a;
+    constexpr juce_wchar cr = 0x0d;
+    constexpr juce_wchar tab = 0x09;
+
+    std::map<size_t, ControlCharacter> result;  // Keyed by index into text; std::map iterates in ascending key order.
+
+    const auto iMax = text.size();
+
+    for (const auto [i, c] : enumerate (text, size_t{}))
+    {
+        if (c == lf)
+        {
+            result[i] = ControlCharacter::lf;  // Also reached for the LF of a CR LF pair, so both indices get an entry.
+            continue;
+        }
+
+        if (c == cr)
+        {
+            if (iMax - i > 1 && text[i + 1] == lf)  // Only look ahead when a following character exists.
+                result[i] = ControlCharacter::crFollowedByLf;
+            else
+                result[i] = ControlCharacter::cr;
+
+            continue;
+        }
+
+        if (c == tab)
+            result[i] = ControlCharacter::tab;
+    }
+
+    return result;
+}
+
 /*  Returns glyphs in logical order as that favours wrapping. */
 static std::vector<ShapedGlyph> lowLevelShape (const String& string,
                                               Range<int64> range,
@ -358,46 +401,32 @@ static std::vector<ShapedGlyph> lowLevelShape (const String& string,
                        0,
                        0);

-    // Adding the converted portion of the text with hb_buffer_add_utf32() or especially with
-    // hb_buffer_add() gives us control over cluster numbers. hb_buffer_add_utf32() will increment
-    // cluster numbers by unicode codepoints (as opposed to UTF8 bytes) starting from 0.
-    auto utf32Span = Span { string.toUTF32().getAddress() + (size_t) range.getStart(),
-                            (size_t) range.getLength() };
+    const Span utf32Span { string.toUTF32().getAddress() + (size_t) range.getStart(),
+                           (size_t) range.getLength() };

-    // We're using a word joiner (zero width non-breaking space) followed by a non-breaking space
-    // for visual representation. This is so that it's not possible to break the glyph representing
-    // the line breaking glyph on its own.
-    static constexpr uint32_t crLf[] = { 0x2060, 0x00A0 };
+    const auto controlChars = findControlCharacters (utf32Span);
+    auto nextControlChar = controlChars.begin();

-    const auto numLineEndsToReplace = [&]
+    for (const auto pair : enumerate (utf32Span, size_t{}))
    {
-        constexpr auto lf = 0x0a;
-        constexpr auto cr = 0x0d;
-
-        if (! utf32Span.empty() && (utf32Span.back() == lf || utf32Span.back() == cr))
+        const auto charToAdd = [&]
        {
-            if (utf32Span.size() >= 2 && utf32Span[utf32Span.size() - 2] == cr)
-                return 2;
+            if (nextControlChar == controlChars.end() || pair.index != nextControlChar->first)
+                return pair.value;

-            return 1;
-        }
+            constexpr juce_wchar wordJoiner       = 0x2060;
+            constexpr juce_wchar nonBreakingSpace = 0x00a0;

-        return 0;
-    }();
+            const auto replacement = nextControlChar->second == ControlCharacter::crFollowedByLf
+                                   ? wordJoiner
+                                   : nonBreakingSpace;

-    hb_buffer_add_utf32 (buffer.get(),
-                         (uint32_t*) utf32Span.data(),
-                         (int) range.getLength() - numLineEndsToReplace,
-                         (unsigned int) 0,
-                         (int) range.getLength() - numLineEndsToReplace);
+            ++nextControlChar;

-    for (int i = 0; i < numLineEndsToReplace; ++i)
-    {
-        // The following gets cluster values right, but this does not follow clearly from harfbuzz documentation.
-        // Add at least a regression test checking the correctness of cluster values.
-        hb_buffer_add (buffer.get(),
-                       static_cast<hb_codepoint_t> (*(crLf + (2 - numLineEndsToReplace) + i)),
-                       (unsigned int) ((int) range.getLength() - numLineEndsToReplace + i));
+            return replacement;
+        }();
+
+        hb_buffer_add (buffer.get(), static_cast<hb_codepoint_t> (charToAdd), (unsigned int) pair.index);
    }

    const auto postContextByteRange = utf8Lookup.getByteRange (Range<int64> { range.getEnd(), (int64) string.length() });