From 1e5c88899eba2aac16b687784d7d9c8bf066ab27 Mon Sep 17 00:00:00 2001 From: Anthony Nicholls Date: Wed, 17 Jul 2024 19:59:24 +0100 Subject: [PATCH] JSON: Use UTF8 encoding by default --- BREAKING_CHANGES.md | 36 +++ .../containers/juce_DynamicObject.cpp | 2 +- modules/juce_core/javascript/juce_JSON.cpp | 288 ++++++++++++++---- modules/juce_core/javascript/juce_JSON.h | 33 +- 4 files changed, 288 insertions(+), 71 deletions(-) diff --git a/BREAKING_CHANGES.md b/BREAKING_CHANGES.md index 07a2ea5244..a73cf4ed3c 100644 --- a/BREAKING_CHANGES.md +++ b/BREAKING_CHANGES.md @@ -87,6 +87,42 @@ invokeMethod() non-virtual forces users to add methods with setMethod() instead of overriding invokeMethod(), which is more compatible with QuickJS. +## Change + +The default JSON encoding has changed from ASCII escape sequences to UTF-8. + +**Possible Issues** + +A non-standards-compliant parser that expects ASCII-only JSON text may fail to +parse UTF-8 encoded JSON files. Code that relies on the raw JSON-encoded string, +for example for file comparison, Base64 encoding, or encryption, may see +different output from different versions of JUCE for identical JSON data, +causing false mismatches. + +Note: JSON files that only ever encoded ASCII text will NOT be affected. + +**Workaround** + +Use the `JSON::writeToStream()` or `JSON::toString()` functions that take a +`FormatOptions` parameter and call `withEncoding (JSON::Encoding::ascii)` on the +`FormatOptions` object. + +**Rationale** + +RFC 8259 states: + +> JSON text exchanged between systems that are not part of a closed ecosystem +MUST be encoded using UTF-8 [RFC3629]. +> +> Previous specifications of JSON have not required the use of UTF-8 when +transmitting JSON text. However, the vast majority of JSON-based software +implementations have chosen to use the UTF-8 encoding, to the extent that it is +the only encoding that achieves interoperability. 
+ +For this reason UTF-8 encoding has better interoperability than ASCII escape +sequences. + + ## Change The ASCII and Unicode BEL character (U+0007) escape sequence has changed in the diff --git a/modules/juce_core/containers/juce_DynamicObject.cpp b/modules/juce_core/containers/juce_DynamicObject.cpp index 0a1441d463..a948d7ea38 100644 --- a/modules/juce_core/containers/juce_DynamicObject.cpp +++ b/modules/juce_core/containers/juce_DynamicObject.cpp @@ -120,7 +120,7 @@ void DynamicObject::writeAsJSON (OutputStream& out, const JSON::FormatOptions& f JSONFormatter::writeSpaces (out, format.getIndentLevel() + JSONFormatter::indentSize); out << '"'; - JSONFormatter::writeString (out, properties.getName (i)); + JSONFormatter::writeString (out, properties.getName (i), format.getEncoding()); out << "\":"; if (format.getSpacing() != JSON::Spacing::none) diff --git a/modules/juce_core/javascript/juce_JSON.cpp b/modules/juce_core/javascript/juce_JSON.cpp index 44a677549a..ddf2077d7c 100644 --- a/modules/juce_core/javascript/juce_JSON.cpp +++ b/modules/juce_core/javascript/juce_JSON.cpp @@ -92,6 +92,69 @@ struct JSONParser return {}; } + int parseHexDigit() + { + const auto digitValue = CharacterFunctions::getHexDigitValue (readChar()); + + if (digitValue < 0) + throwError ("Invalid hex character", currentLocation - 1); + + return digitValue; + } + + CharPointer_UTF16::CharType parseCodeUnit() + { + int codeUnit = 0; // read the 4 hex digits one statement at a time: operands of '|' have no guaranteed evaluation order + for (int i = 0; i < 4; ++i) + codeUnit = (codeUnit << 4) | parseHexDigit(); + return (CharPointer_UTF16::CharType) codeUnit; + } + + static constexpr juce_wchar asCodePoint (CharPointer_UTF16::CharType codeUnit) + { + return (juce_wchar) (uint32) (uint16) codeUnit; + } + + CharPointer_UTF16::CharType parseLowSurrogateCodeUnit() + { + const auto errorLocation = currentLocation; + + const auto throwLowSurrogateError = [&]() + { + throwError ("Expected UTF-16 low surrogate", errorLocation); + }; + + if (readChar() != '\\' || readChar() != 'u') + 
throwLowSurrogateError(); + + const auto lowSurrogate = parseCodeUnit(); + + if (! CharacterFunctions::isLowSurrogate (asCodePoint (lowSurrogate))) + throwLowSurrogateError(); + + return lowSurrogate; + } + + juce_wchar parseEscapeSequence() + { + const auto errorLocation = currentLocation - 2; + + const auto codeUnits = [&]() -> std::array + { + const auto firstCodeUnit = parseCodeUnit(); + + if (CharacterFunctions::isNonSurrogateCodePoint (asCodePoint (firstCodeUnit))) + return { firstCodeUnit, 0 }; + + if (! CharacterFunctions::isHighSurrogate (asCodePoint (firstCodeUnit))) + throwError ("Invalid UTF-16 escape sequence", errorLocation); + + return { firstCodeUnit, parseLowSurrogateCodeUnit() }; + }(); + + return CharPointer_UTF16 (codeUnits.data()).getAndAdvance(); + } + String parseString (const juce_wchar quoteChar) { MemoryOutputStream buffer (256); @@ -105,7 +168,6 @@ struct JSONParser if (c == '\\') { - auto errorLocation = currentLocation; c = readChar(); switch (c) @@ -113,33 +175,18 @@ struct JSONParser case '"': case '\'': case '\\': - case '/': break; + case '/': break; - case 'a': c = '\a'; break; - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; + case 'a': c = '\a'; break; + case 'b': c = '\b'; break; + case 'f': c = '\f'; break; + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; - case 'u': - { - c = 0; + case 'u': c = parseEscapeSequence(); break; - for (int i = 4; --i >= 0;) - { - auto digitValue = CharacterFunctions::getHexDigitValue (readChar()); - - if (digitValue < 0) - throwError ("Syntax error in unicode escape sequence", errorLocation); - - c = (juce_wchar) ((c << 4) + static_cast (digitValue)); - } - - break; - } - - default: break; + default: break; } } @@ -323,15 +370,15 @@ struct JSONFormatter out << "\\u" << String::toHexString ((int) value).paddedLeft ('0', 4); } - static void writeString (OutputStream& 
out, String::CharPointerType t) + static void writeString (OutputStream& out, String::CharPointerType t, JSON::Encoding encoding) { for (;;) { - auto c = t.getAndAdvance(); + const auto c = t.getAndAdvance(); switch (c) { - case 0: return; + case 0: return; case '\"': out << "\\\""; break; case '\\': out << "\\\\"; break; @@ -342,27 +389,42 @@ struct JSONFormatter case '\n': out << "\\n"; break; default: - if (c >= 32 && c < 127) + if (CharacterFunctions::isAsciiControlCharacter (c)) { - out << (char) c; + writeEscapedChar (out, (unsigned short) c); } else { - if (CharPointer_UTF16::getBytesRequiredFor (c) > 2) + switch (encoding) { - CharPointer_UTF16::CharType chars[2]; - CharPointer_UTF16 utf16 (chars); - utf16.write (c); + case JSON::Encoding::utf8: + out << String::charToString (c); + break; - for (int i = 0; i < 2; ++i) - writeEscapedChar (out, (unsigned short) chars[i]); - } - else - { - writeEscapedChar (out, (unsigned short) c); + case JSON::Encoding::ascii: + if (CharacterFunctions::isAscii (c)) + { + out << String::charToString (c); + } + else if (CharacterFunctions::isPartOfBasicMultilingualPlane (c)) + { + if (CharacterFunctions::isNonSurrogateCodePoint (c)) + writeEscapedChar (out, (unsigned short) c); + else + jassertfalse; // Illegal unicode character + } + else + { + CharPointer_UTF16::CharType codeUnits[2] = {}; + CharPointer_UTF16 utf16 (codeUnits); + utf16.write (c); + + for (auto& codeUnit : codeUnits) + writeEscapedChar (out, (unsigned short) codeUnit); + } + break; } } - break; } } @@ -420,7 +482,7 @@ void JSON::writeToStream (OutputStream& out, const var& v, const FormatOptions& if (v.isString()) { out << '"'; - JSONFormatter::writeString (out, v.toString().getCharPointer()); + JSONFormatter::writeString (out, v.toString().getCharPointer(), opt.getEncoding()); out << '"'; } else if (v.isVoid()) @@ -536,7 +598,7 @@ void JSON::writeToStream (OutputStream& output, const var& data, const bool allO String JSON::escapeString (StringRef s) { 
MemoryOutputStream mo; - JSONFormatter::writeString (mo, s.text); + JSONFormatter::writeString (mo, s.text, Encoding::ascii); return mo.toString(); } @@ -650,11 +712,126 @@ public: } } + void expectCharacterEncoding (juce_wchar character, const String& expectedOutput, JSON::Encoding encoding) + { + const auto input = String::charToString (character); + const auto quotedOutput = '"' + expectedOutput + '"'; + expectEquals (JSON::toString (input, JSON::FormatOptions{}.withEncoding (encoding)), quotedOutput); + expectEquals (JSON::fromString (quotedOutput).toString(), input); + } + + void expectNoEscapeSequence (juce_wchar input) + { + const auto inputString = String::charToString (input); + expectCharacterEncoding (input, inputString, JSON::Encoding::ascii); + expectCharacterEncoding (input, inputString, JSON::Encoding::utf8); + } + + void expectEscapeSequenceForAllEncodings (juce_wchar input, const String& escapeSequence) + { + expectCharacterEncoding (input, escapeSequence, JSON::Encoding::ascii); + expectCharacterEncoding (input, escapeSequence, JSON::Encoding::utf8); + } + + void expectEscapeSequenceForAsciiEncodingOnly (juce_wchar input, const String& escapeSequence) + { + expectCharacterEncoding (input, escapeSequence, JSON::Encoding::ascii); + expectCharacterEncoding (input, String::charToString (input), JSON::Encoding::utf8); + } + void runTest() override { + beginTest ("Float formatting"); { - beginTest ("JSON"); + std::map tests; + tests[1] = "1.0"; + tests[1.1] = "1.1"; + tests[1.01] = "1.01"; + tests[0.76378] = "0.76378"; + tests[-10] = "-10.0"; + tests[10.01] = "10.01"; + tests[0.0123] = "0.0123"; + tests[-3.7e-27] = "-3.7e-27"; + tests[1e+40] = "1.0e40"; + tests[-12345678901234567.0] = "-1.234567890123457e16"; + tests[192000] = "192000.0"; + tests[1234567] = "1.234567e6"; + tests[0.00006] = "0.00006"; + tests[0.000006] = "6.0e-6"; + for (auto& test : tests) + expectEquals (JSON::toString (test.first), test.second); + } + + beginTest ("ASCII control 
characters are always escaped"); + { + expectEscapeSequenceForAllEncodings ('\x01', "\\u0001"); + expectEscapeSequenceForAllEncodings ('\x02', "\\u0002"); + expectEscapeSequenceForAllEncodings ('\x03', "\\u0003"); + expectEscapeSequenceForAllEncodings ('\x04', "\\u0004"); + expectEscapeSequenceForAllEncodings ('\x05', "\\u0005"); + expectEscapeSequenceForAllEncodings ('\x06', "\\u0006"); + expectEscapeSequenceForAllEncodings ('\x07', "\\u0007"); + expectEscapeSequenceForAllEncodings ('\x08', "\\b"); + expectEscapeSequenceForAllEncodings ('\x09', "\\t"); + expectEscapeSequenceForAllEncodings ('\x0a', "\\n"); + expectEscapeSequenceForAllEncodings ('\x0b', "\\u000b"); + expectEscapeSequenceForAllEncodings ('\x0c', "\\f"); + expectEscapeSequenceForAllEncodings ('\x0d', "\\r"); + expectEscapeSequenceForAllEncodings ('\x0e', "\\u000e"); + expectEscapeSequenceForAllEncodings ('\x0f', "\\u000f"); + expectEscapeSequenceForAllEncodings ('\x10', "\\u0010"); + expectEscapeSequenceForAllEncodings ('\x11', "\\u0011"); + expectEscapeSequenceForAllEncodings ('\x12', "\\u0012"); + expectEscapeSequenceForAllEncodings ('\x13', "\\u0013"); + expectEscapeSequenceForAllEncodings ('\x14', "\\u0014"); + expectEscapeSequenceForAllEncodings ('\x15', "\\u0015"); + expectEscapeSequenceForAllEncodings ('\x16', "\\u0016"); + expectEscapeSequenceForAllEncodings ('\x17', "\\u0017"); + expectEscapeSequenceForAllEncodings ('\x18', "\\u0018"); + expectEscapeSequenceForAllEncodings ('\x19', "\\u0019"); + expectEscapeSequenceForAllEncodings ('\x1a', "\\u001a"); + expectEscapeSequenceForAllEncodings ('\x1b', "\\u001b"); + expectEscapeSequenceForAllEncodings ('\x1c', "\\u001c"); + expectEscapeSequenceForAllEncodings ('\x1d', "\\u001d"); + expectEscapeSequenceForAllEncodings ('\x1e', "\\u001e"); + expectEscapeSequenceForAllEncodings ('\x1f', "\\u001f"); + } + + beginTest ("Only special ASCII characters are escaped"); + { + for (juce_wchar c = 32; CharacterFunctions::isAscii (c); ++c) + { + if (c == '"') // match each special character exactly so every other character reaches the no-escape check
+ expectEscapeSequenceForAllEncodings ('"', R"(\")"); + else if (c == '\\') + expectEscapeSequenceForAllEncodings ('\\', R"(\\)"); + else + expectNoEscapeSequence (c); + } + } + + beginTest ("Unicode characters are escaped for ASCII encoding only"); + { + // First and last 2 byte UTF-8 code points + expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x0080, "\\u0080"); + expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x07FF, "\\u07ff"); + + // First and last 3 byte UTF-8 code points + expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x0800, "\\u0800"); + expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xffff, "\\uffff"); + + // Code points at the UTF-16 surrogate boundaries + expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xd7ff, "\\ud7ff"); + expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xe000, "\\ue000"); + + // First and last 4 byte UTF-8 code points (also first and last UTF-16 surrogate pairs) + expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x010000, "\\ud800\\udc00"); + expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x10ffff, "\\udbff\\udfff"); + } + + beginTest ("Fuzz tests"); + { auto r = getRandom(); expect (JSON::parse (String()) == var()); @@ -681,29 +858,6 @@ public: expect (asString.isNotEmpty() && parsedString == asString); } } - - { - beginTest ("Float formatting"); - - std::map tests; - tests[1] = "1.0"; - tests[1.1] = "1.1"; - tests[1.01] = "1.01"; - tests[0.76378] = "0.76378"; - tests[-10] = "-10.0"; - tests[10.01] = "10.01"; - tests[0.0123] = "0.0123"; - tests[-3.7e-27] = "-3.7e-27"; - tests[1e+40] = "1.0e40"; - tests[-12345678901234567.0] = "-1.234567890123457e16"; - tests[192000] = "192000.0"; - tests[1234567] = "1.234567e6"; - tests[0.00006] = "0.00006"; - tests[0.000006] = "6.0e-6"; - - for (auto& test : tests) - expectEquals (JSON::toString (test.first), test.second); - } } }; diff --git a/modules/juce_core/javascript/juce_JSON.h b/modules/juce_core/javascript/juce_JSON.h index 
8e7d4d8bab..8c876fc472 100644 --- a/modules/juce_core/javascript/juce_JSON.h +++ b/modules/juce_core/javascript/juce_JSON.h @@ -107,6 +107,12 @@ public: multiLine, ///< Newlines and spaces will be included in the output, in order to make it easy to read for humans }; + enum class Encoding + { + utf8, ///< Use UTF-8 avoiding escape sequences for non-ASCII characters, this is the default behaviour + ascii, ///< Use ASCII characters only, unicode characters will be encoded using UTF-16 escape sequences + }; + /** Allows formatting var objects as JSON with various configurable options. */ @@ -114,17 +120,34 @@ public: { public: /** Returns a copy of this Formatter with the specified spacing. */ - FormatOptions withSpacing (Spacing x) const { return withMember (*this, &FormatOptions::spacing, x); } + FormatOptions withSpacing (Spacing x) const + { + return withMember (*this, &FormatOptions::spacing, x); + } /** Returns a copy of this Formatter with the specified maximum number of decimal places. This option determines the precision of floating point numbers in scientific notation. */ - FormatOptions withMaxDecimalPlaces (int x) const { return withMember (*this, &FormatOptions::maxDecimalPlaces, x); } + FormatOptions withMaxDecimalPlaces (int x) const + { + return withMember (*this, &FormatOptions::maxDecimalPlaces, x); + } /** Returns a copy of this Formatter with the specified indent level. This should only be necessary when serialising multiline nested types. */ - FormatOptions withIndentLevel (int x) const { return withMember (*this, &FormatOptions::indent, x); } + FormatOptions withIndentLevel (int x) const + { + return withMember (*this, &FormatOptions::indent, x); + } + + /** Returns a copy of this Formatter with the specified encoding. + Use this to force a JSON to be ASCII characters only. + */ + FormatOptions withEncoding (Encoding x) const + { + return withMember (*this, &FormatOptions::encoding, x); + } /** Returns the spacing used by this Formatter. 
*/ Spacing getSpacing() const { return spacing; } @@ -135,8 +158,12 @@ public: /** Returns the indent level of this Formatter. */ int getIndentLevel() const { return indent; } + /** Returns the encoding of this Formatter. */ + Encoding getEncoding() const { return encoding; } + private: Spacing spacing = Spacing::multiLine; + Encoding encoding = Encoding::utf8; int maxDecimalPlaces = 15; int indent = 0; };