From 1e5c88899eba2aac16b687784d7d9c8bf066ab27 Mon Sep 17 00:00:00 2001
From: Anthony Nicholls <anthony@juce.com>
Date: Wed, 17 Jul 2024 19:59:24 +0100
Subject: [PATCH] JSON: Use UTF8 encoding by default

---
 BREAKING_CHANGES.md                           |  36 +++
 .../containers/juce_DynamicObject.cpp         |   2 +-
 modules/juce_core/javascript/juce_JSON.cpp    | 288 ++++++++++++++----
 modules/juce_core/javascript/juce_JSON.h      |  33 +-
 4 files changed, 288 insertions(+), 71 deletions(-)

diff --git a/BREAKING_CHANGES.md b/BREAKING_CHANGES.md
index 07a2ea5244..a73cf4ed3c 100644
--- a/BREAKING_CHANGES.md
+++ b/BREAKING_CHANGES.md
@@ -87,6 +87,42 @@ invokeMethod() non-virtual forces users to add methods with setMethod() instead
 of overriding invokeMethod(), which is more compatible with QuickJS.
 
 
+## Change
+
+The default JSON encoding has changed from ASCII escape sequences to UTF-8.
+
+**Possible Issues**
+
+A non-standards-compliant parser that expects ASCII-only JSON text may fail to
+parse UTF-8 encoded JSON files. Code that relies on the raw JSON-encoded string,
+for example for file comparison, Base64 encoding, or any encryption, may see
+false negatives because the same data can now be encoded differently by
+different versions of JUCE.
+
+Note: JSON files that only ever encoded ASCII text will NOT be affected.
+
+**Workaround**
+
+Use the `JSON::writeToStream()` or `JSON::toString()` functions that take a
+`FormatOptions` parameter and call `withEncoding (JSON::Encoding::ascii)` on the
+`FormatOptions` object.
+
+**Rationale**
+
+RFC 8259 states
+
+> JSON text exchanged between systems that are not part of a closed ecosystem
+MUST be encoded using UTF-8 [RFC3629].
+>
+> Previous specifications of JSON have not required the use of UTF-8 when
+transmitting JSON text.  However, the vast majority of JSON-based software
+implementations have chosen to use the UTF-8 encoding, to the extent that it is
+the only encoding that achieves interoperability.
+
+For this reason UTF-8 encoding has better interoperability than ASCII escape
+sequences.
+
+
 ## Change
 
 The ASCII and Unicode BEL character (U+0007) escape sequence has changed in the
diff --git a/modules/juce_core/containers/juce_DynamicObject.cpp b/modules/juce_core/containers/juce_DynamicObject.cpp
index 0a1441d463..a948d7ea38 100644
--- a/modules/juce_core/containers/juce_DynamicObject.cpp
+++ b/modules/juce_core/containers/juce_DynamicObject.cpp
@@ -120,7 +120,7 @@ void DynamicObject::writeAsJSON (OutputStream& out, const JSON::FormatOptions& f
             JSONFormatter::writeSpaces (out, format.getIndentLevel() + JSONFormatter::indentSize);
 
         out << '"';
-        JSONFormatter::writeString (out, properties.getName (i));
+        JSONFormatter::writeString (out, properties.getName (i), format.getEncoding());
         out << "\":";
 
         if (format.getSpacing() != JSON::Spacing::none)
diff --git a/modules/juce_core/javascript/juce_JSON.cpp b/modules/juce_core/javascript/juce_JSON.cpp
index 44a677549a..ddf2077d7c 100644
--- a/modules/juce_core/javascript/juce_JSON.cpp
+++ b/modules/juce_core/javascript/juce_JSON.cpp
@@ -92,6 +92,69 @@ struct JSONParser
         return {};
     }
 
+    int parseHexDigit() // reads the next character and returns its value as a hex digit
+    {
+        const auto digitValue = CharacterFunctions::getHexDigitValue (readChar());
+
+        if (digitValue < 0) // a negative result is treated as "not a hex digit"
+            throwError ("Invalid hex character", currentLocation - 1); // presumably points at the character just read - TODO confirm readChar() advances currentLocation by one
+
+        return digitValue;
+    }
+
+    CharPointer_UTF16::CharType parseCodeUnit()
+    {
+        int codeUnit = 0; // four hex digits, most significant first, form one UTF-16 code unit
+        for (int i = 0; i < 4; ++i) // one read per statement: operand evaluation order of '|' is unspecified
+            codeUnit = (codeUnit << 4) | parseHexDigit();
+        return (CharPointer_UTF16::CharType) codeUnit;
+    }
+
+    static constexpr juce_wchar asCodePoint (CharPointer_UTF16::CharType codeUnit) // widens a single UTF-16 code unit to a juce_wchar
+    {
+        return (juce_wchar) (uint32) (uint16) codeUnit; // go via unsigned 16-bit first so the value is zero-extended rather than sign-extended
+    }
+
+    CharPointer_UTF16::CharType parseLowSurrogateCodeUnit() // reads a "\uXXXX" escape that must be a UTF-16 low surrogate
+    {
+        const auto errorLocation = currentLocation; // captured before reading anything, so both errors below report this position
+
+        const auto throwLowSurrogateError = [&]()
+        {
+            throwError ("Expected UTF-16 low surrogate", errorLocation);
+        };
+
+        if (readChar() != '\\' || readChar() != 'u') // the next two characters must be the "\u" prefix
+            throwLowSurrogateError();
+
+        const auto lowSurrogate = parseCodeUnit();
+
+        if (! CharacterFunctions::isLowSurrogate (asCodePoint (lowSurrogate))) // any other code unit is rejected
+            throwLowSurrogateError();
+
+        return lowSurrogate;
+    }
+
+    juce_wchar parseEscapeSequence() // decodes the hex part of a "\u" escape (one code unit or a surrogate pair) into one code point
+    {
+        const auto errorLocation = currentLocation - 2; // presumably the start of the already-consumed "\u" - TODO confirm against the caller
+
+        const auto codeUnits = [&]() -> std::array<CharPointer_UTF16::CharType, 2>
+        {
+            const auto firstCodeUnit = parseCodeUnit();
+
+            if (CharacterFunctions::isNonSurrogateCodePoint (asCodePoint (firstCodeUnit))) // a non-surrogate code unit is complete on its own
+                return { firstCodeUnit, 0 };
+
+            if (! CharacterFunctions::isHighSurrogate (asCodePoint (firstCodeUnit))) // a surrogate that is not a high surrogate cannot start a pair
+                throwError ("Invalid UTF-16 escape sequence", errorLocation);
+
+            return { firstCodeUnit, parseLowSurrogateCodeUnit() }; // a high surrogate must be followed by an escaped low surrogate
+        }();
+
+        return CharPointer_UTF16 (codeUnits.data()).getAndAdvance(); // read the code unit(s) back as a single juce_wchar
+    }
+
     String parseString (const juce_wchar quoteChar)
     {
         MemoryOutputStream buffer (256);
@@ -105,7 +168,6 @@ struct JSONParser
 
             if (c == '\\')
             {
-                auto errorLocation = currentLocation;
                 c = readChar();
 
                 switch (c)
@@ -113,33 +175,18 @@ struct JSONParser
                     case '"':
                     case '\'':
                     case '\\':
-                    case '/':  break;
+                    case '/': break;
 
-                    case 'a':  c = '\a'; break;
-                    case 'b':  c = '\b'; break;
-                    case 'f':  c = '\f'; break;
-                    case 'n':  c = '\n'; break;
-                    case 'r':  c = '\r'; break;
-                    case 't':  c = '\t'; break;
+                    case 'a': c = '\a'; break;
+                    case 'b': c = '\b'; break;
+                    case 'f': c = '\f'; break;
+                    case 'n': c = '\n'; break;
+                    case 'r': c = '\r'; break;
+                    case 't': c = '\t'; break;
 
-                    case 'u':
-                    {
-                        c = 0;
+                    case 'u': c = parseEscapeSequence(); break;
 
-                        for (int i = 4; --i >= 0;)
-                        {
-                            auto digitValue = CharacterFunctions::getHexDigitValue (readChar());
-
-                            if (digitValue < 0)
-                                throwError ("Syntax error in unicode escape sequence", errorLocation);
-
-                            c = (juce_wchar) ((c << 4) + static_cast<juce_wchar> (digitValue));
-                        }
-
-                        break;
-                    }
-
-                    default:  break;
+                    default: break;
                 }
             }
 
@@ -323,15 +370,15 @@ struct JSONFormatter
         out << "\\u" << String::toHexString ((int) value).paddedLeft ('0', 4);
     }
 
-    static void writeString (OutputStream& out, String::CharPointerType t)
+    static void writeString (OutputStream& out, String::CharPointerType t, JSON::Encoding encoding)
     {
         for (;;)
         {
-            auto c = t.getAndAdvance();
+            const auto c = t.getAndAdvance();
 
             switch (c)
             {
-                case 0:  return;
+                case 0: return;
 
                 case '\"': out << "\\\""; break;
                 case '\\': out << "\\\\"; break;
@@ -342,27 +389,42 @@ struct JSONFormatter
                 case '\n': out << "\\n";  break;
 
                 default:
-                    if (c >= 32 && c < 127)
+                    if (CharacterFunctions::isAsciiControlCharacter (c))
                     {
-                        out << (char) c;
+                        writeEscapedChar (out, (unsigned short) c);
                     }
                     else
                     {
-                        if (CharPointer_UTF16::getBytesRequiredFor (c) > 2)
+                        switch (encoding)
                         {
-                            CharPointer_UTF16::CharType chars[2];
-                            CharPointer_UTF16 utf16 (chars);
-                            utf16.write (c);
+                            case JSON::Encoding::utf8:
+                                out << String::charToString (c);
+                                break;
 
-                            for (int i = 0; i < 2; ++i)
-                                writeEscapedChar (out, (unsigned short) chars[i]);
-                        }
-                        else
-                        {
-                            writeEscapedChar (out, (unsigned short) c);
+                            case JSON::Encoding::ascii:
+                                if (CharacterFunctions::isAscii (c))
+                                {
+                                    out << String::charToString (c);
+                                }
+                                else if (CharacterFunctions::isPartOfBasicMultilingualPlane (c))
+                                {
+                                    if (CharacterFunctions::isNonSurrogateCodePoint (c))
+                                        writeEscapedChar (out, (unsigned short) c);
+                                    else
+                                        jassertfalse; // Illegal unicode character
+                                }
+                                else
+                                {
+                                    CharPointer_UTF16::CharType codeUnits[2] = {};
+                                    CharPointer_UTF16 utf16 (codeUnits);
+                                    utf16.write (c);
+
+                                    for (auto& codeUnit : codeUnits)
+                                        writeEscapedChar (out, (unsigned short) codeUnit);
+                                }
+                                break;
                         }
                     }
-
                     break;
             }
         }
@@ -420,7 +482,7 @@ void JSON::writeToStream (OutputStream& out, const var& v, const FormatOptions&
     if (v.isString())
     {
         out << '"';
-        JSONFormatter::writeString (out, v.toString().getCharPointer());
+        JSONFormatter::writeString (out, v.toString().getCharPointer(), opt.getEncoding());
         out << '"';
     }
     else if (v.isVoid())
@@ -536,7 +598,7 @@ void JSON::writeToStream (OutputStream& output, const var& data, const bool allO
 String JSON::escapeString (StringRef s)
 {
     MemoryOutputStream mo;
-    JSONFormatter::writeString (mo, s.text);
+    JSONFormatter::writeString (mo, s.text, Encoding::ascii);
     return mo.toString();
 }
 
@@ -650,11 +712,126 @@ public:
         }
     }
 
+    void expectCharacterEncoding (juce_wchar character, const String& expectedOutput, JSON::Encoding encoding) // round-trip check for a single character
+    {
+        const auto input = String::charToString (character);
+        const auto quotedOutput = '"' + expectedOutput + '"'; // a JSON string value is written surrounded by double quotes
+        expectEquals (JSON::toString (input, JSON::FormatOptions{}.withEncoding (encoding)), quotedOutput); // writing with the given encoding yields the expected text
+        expectEquals (JSON::fromString (quotedOutput).toString(), input); // parsing that text yields the original character
+    }
+
+    void expectNoEscapeSequence (juce_wchar input) // expects the character to be written as itself by both encodings
+    {
+        const auto inputString = String::charToString (input);
+        expectCharacterEncoding (input, inputString, JSON::Encoding::ascii);
+        expectCharacterEncoding (input, inputString, JSON::Encoding::utf8);
+    }
+
+    void expectEscapeSequenceForAllEncodings (juce_wchar input, const String& escapeSequence) // expects the same escape sequence from both encodings
+    {
+        expectCharacterEncoding (input, escapeSequence, JSON::Encoding::ascii);
+        expectCharacterEncoding (input, escapeSequence, JSON::Encoding::utf8);
+    }
+
+    void expectEscapeSequenceForAsciiEncodingOnly (juce_wchar input, const String& escapeSequence) // escaped for ascii, written as itself for utf8
+    {
+        expectCharacterEncoding (input, escapeSequence, JSON::Encoding::ascii);
+        expectCharacterEncoding (input, String::charToString (input), JSON::Encoding::utf8);
+    }
+
     void runTest() override
     {
+        beginTest ("Float formatting");
         {
-            beginTest ("JSON");
+            std::map<double, String> tests;
+            tests[1] = "1.0";
+            tests[1.1] = "1.1";
+            tests[1.01] = "1.01";
+            tests[0.76378] = "0.76378";
+            tests[-10] = "-10.0";
+            tests[10.01] = "10.01";
+            tests[0.0123] = "0.0123";
+            tests[-3.7e-27] = "-3.7e-27";
+            tests[1e+40] = "1.0e40";
+            tests[-12345678901234567.0] = "-1.234567890123457e16";
+            tests[192000] = "192000.0";
+            tests[1234567] = "1.234567e6";
+            tests[0.00006] = "0.00006";
+            tests[0.000006] = "6.0e-6";
 
+            for (auto& test : tests)
+                expectEquals (JSON::toString (test.first), test.second);
+        }
+
+        beginTest ("ASCII control characters are always escaped");
+        {
+            expectEscapeSequenceForAllEncodings ('\x01', "\\u0001");
+            expectEscapeSequenceForAllEncodings ('\x02', "\\u0002");
+            expectEscapeSequenceForAllEncodings ('\x03', "\\u0003");
+            expectEscapeSequenceForAllEncodings ('\x04', "\\u0004");
+            expectEscapeSequenceForAllEncodings ('\x05', "\\u0005");
+            expectEscapeSequenceForAllEncodings ('\x06', "\\u0006");
+            expectEscapeSequenceForAllEncodings ('\x07', "\\u0007");
+            expectEscapeSequenceForAllEncodings ('\x08', "\\b");
+            expectEscapeSequenceForAllEncodings ('\x09', "\\t");
+            expectEscapeSequenceForAllEncodings ('\x0a', "\\n");
+            expectEscapeSequenceForAllEncodings ('\x0b', "\\u000b");
+            expectEscapeSequenceForAllEncodings ('\x0c', "\\f");
+            expectEscapeSequenceForAllEncodings ('\x0d', "\\r");
+            expectEscapeSequenceForAllEncodings ('\x0e', "\\u000e");
+            expectEscapeSequenceForAllEncodings ('\x0f', "\\u000f");
+            expectEscapeSequenceForAllEncodings ('\x10', "\\u0010");
+            expectEscapeSequenceForAllEncodings ('\x11', "\\u0011");
+            expectEscapeSequenceForAllEncodings ('\x12', "\\u0012");
+            expectEscapeSequenceForAllEncodings ('\x13', "\\u0013");
+            expectEscapeSequenceForAllEncodings ('\x14', "\\u0014");
+            expectEscapeSequenceForAllEncodings ('\x15', "\\u0015");
+            expectEscapeSequenceForAllEncodings ('\x16', "\\u0016");
+            expectEscapeSequenceForAllEncodings ('\x17', "\\u0017");
+            expectEscapeSequenceForAllEncodings ('\x18', "\\u0018");
+            expectEscapeSequenceForAllEncodings ('\x19', "\\u0019");
+            expectEscapeSequenceForAllEncodings ('\x1a', "\\u001a");
+            expectEscapeSequenceForAllEncodings ('\x1b', "\\u001b");
+            expectEscapeSequenceForAllEncodings ('\x1c', "\\u001c");
+            expectEscapeSequenceForAllEncodings ('\x1d', "\\u001d");
+            expectEscapeSequenceForAllEncodings ('\x1e', "\\u001e");
+            expectEscapeSequenceForAllEncodings ('\x1f', "\\u001f");
+        }
+
+        beginTest ("Only special ASCII characters are escaped");
+        {
+            for (juce_wchar c = 32; CharacterFunctions::isAscii (c); ++c) // every ASCII character from space upwards
+            {
+                if (c == '"') // only the double quote and the backslash need an escape sequence in this range
+                    expectEscapeSequenceForAllEncodings ('"',  R"(\")");
+                else if (c == '\\')
+                    expectEscapeSequenceForAllEncodings ('\\', R"(\\)");
+                else
+                    expectNoEscapeSequence (c);
+            }
+        }
+
+        beginTest ("Unicode characters are escaped for ASCII encoding only");
+        {
+            // First and last 2 byte UTF-8 code points
+            expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x0080, "\\u0080");
+            expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x07FF, "\\u07ff");
+
+            // First and last 3 byte UTF-8 code points
+            expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x0800, "\\u0800");
+            expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xffff, "\\uffff");
+
+            // Code points at the UTF-16 surrogate boundaries
+            expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xd7ff, "\\ud7ff");
+            expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xe000, "\\ue000");
+
+            // First and last 4 byte UTF-8 code points (also first and last UTF-16 surrogate pairs)
+            expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x010000, "\\ud800\\udc00");
+            expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x10ffff, "\\udbff\\udfff");
+        }
+
+        beginTest ("Fuzz tests");
+        {
             auto r = getRandom();
 
             expect (JSON::parse (String()) == var());
@@ -681,29 +858,6 @@ public:
                 expect (asString.isNotEmpty() && parsedString == asString);
             }
         }
-
-        {
-            beginTest ("Float formatting");
-
-            std::map<double, String> tests;
-            tests[1] = "1.0";
-            tests[1.1] = "1.1";
-            tests[1.01] = "1.01";
-            tests[0.76378] = "0.76378";
-            tests[-10] = "-10.0";
-            tests[10.01] = "10.01";
-            tests[0.0123] = "0.0123";
-            tests[-3.7e-27] = "-3.7e-27";
-            tests[1e+40] = "1.0e40";
-            tests[-12345678901234567.0] = "-1.234567890123457e16";
-            tests[192000] = "192000.0";
-            tests[1234567] = "1.234567e6";
-            tests[0.00006] = "0.00006";
-            tests[0.000006] = "6.0e-6";
-
-            for (auto& test : tests)
-                expectEquals (JSON::toString (test.first), test.second);
-        }
     }
 };
 
diff --git a/modules/juce_core/javascript/juce_JSON.h b/modules/juce_core/javascript/juce_JSON.h
index 8e7d4d8bab..8c876fc472 100644
--- a/modules/juce_core/javascript/juce_JSON.h
+++ b/modules/juce_core/javascript/juce_JSON.h
@@ -107,6 +107,12 @@ public:
         multiLine,      ///< Newlines and spaces will be included in the output, in order to make it easy to read for humans
     };
 
+    enum class Encoding
+    {
+        utf8,           ///< Write non-ASCII characters as UTF-8 without escape sequences. This is the default behaviour
+        ascii,          ///< Write ASCII characters only; non-ASCII characters are written as UTF-16 escape sequences
+    };
+
     /**
         Allows formatting var objects as JSON with various configurable options.
     */
@@ -114,17 +120,34 @@ public:
     {
     public:
         /** Returns a copy of this Formatter with the specified spacing. */
-        FormatOptions withSpacing (Spacing x)      const { return withMember (*this, &FormatOptions::spacing, x); }
+        FormatOptions withSpacing (Spacing x) const
+        {
+            return withMember (*this, &FormatOptions::spacing, x);
+        }
 
         /** Returns a copy of this Formatter with the specified maximum number of decimal places.
             This option determines the precision of floating point numbers in scientific notation.
         */
-        FormatOptions withMaxDecimalPlaces (int x) const { return withMember (*this, &FormatOptions::maxDecimalPlaces, x); }
+        FormatOptions withMaxDecimalPlaces (int x) const
+        {
+            return withMember (*this, &FormatOptions::maxDecimalPlaces, x);
+        }
 
         /** Returns a copy of this Formatter with the specified indent level.
             This should only be necessary when serialising multiline nested types.
         */
-        FormatOptions withIndentLevel (int x)      const { return withMember (*this, &FormatOptions::indent, x); }
+        FormatOptions withIndentLevel (int x) const
+        {
+            return withMember (*this, &FormatOptions::indent, x);
+        }
+
+        /** Returns a copy of this Formatter with the specified encoding.
+            Use this to force the JSON output to contain ASCII characters only.
+        */
+        FormatOptions withEncoding (Encoding x) const
+        {
+            return withMember (*this, &FormatOptions::encoding, x);
+        }
 
         /** Returns the spacing used by this Formatter. */
         Spacing getSpacing()      const { return spacing; }
@@ -135,8 +158,12 @@ public:
         /** Returns the indent level of this Formatter. */
         int getIndentLevel()      const { return indent; }
 
+        /** Returns the encoding of this Formatter. */
+        Encoding getEncoding()    const { return encoding; }
+
     private:
         Spacing spacing = Spacing::multiLine;
+        Encoding encoding = Encoding::utf8;
         int maxDecimalPlaces = 15;
         int indent = 0;
     };