1
0
Fork 0
mirror of https://github.com/juce-framework/JUCE.git synced 2026-01-09 23:34:20 +00:00

JSON: Use UTF8 encoding by default

This commit is contained in:
Anthony Nicholls 2024-07-17 19:59:24 +01:00
parent 379afb1e3f
commit 1e5c88899e
4 changed files with 288 additions and 71 deletions

View file

@ -87,6 +87,42 @@ invokeMethod() non-virtual forces users to add methods with setMethod() instead
of overriding invokeMethod(), which is more compatible with QuickJS.
## Change
The default JSON encoding has changed from ASCII escape sequences to UTF-8.
**Possible Issues**
A non-standards-compliant parser that expects ASCII-only input may fail to
parse UTF-8 encoded JSON text. Code that relies on the raw encoded JSON string,
for example for file comparison, Base64 encoding, or encryption, may report
mismatches for JSON containing the same data when it was written by different
versions of JUCE.
Note: JSON files that only ever encoded ASCII text will NOT be affected.
**Workaround**
Use the `JSON::writeToStream()` or `JSON::toString()` functions that take a
`FormatOptions` parameter and call `withEncoding (JSON::Encoding::ascii)` on the
`FormatOptions` object.
**Rationale**
RFC 8259 states:
> JSON text exchanged between systems that are not part of a closed ecosystem
MUST be encoded using UTF-8 [RFC3629].
>
> Previous specifications of JSON have not required the use of UTF-8 when
transmitting JSON text. However, the vast majority of JSON-based software
implementations have chosen to use the UTF-8 encoding, to the extent that it is
the only encoding that achieves interoperability.
For this reason UTF-8 encoding has better interoperability than ASCII escape
sequences.
## Change
The ASCII and Unicode BEL character (U+0007) escape sequence has changed in the

View file

@ -120,7 +120,7 @@ void DynamicObject::writeAsJSON (OutputStream& out, const JSON::FormatOptions& f
JSONFormatter::writeSpaces (out, format.getIndentLevel() + JSONFormatter::indentSize);
out << '"';
JSONFormatter::writeString (out, properties.getName (i));
JSONFormatter::writeString (out, properties.getName (i), format.getEncoding());
out << "\":";
if (format.getSpacing() != JSON::Spacing::none)

View file

@ -92,6 +92,69 @@ struct JSONParser
return {};
}
/** Consumes one character from the input and returns its hexadecimal value.

    The character is converted with CharacterFunctions::getHexDigitValue(); a
    negative result is reported via throwError() with the message
    "Invalid hex character". The error is located at currentLocation - 1,
    presumably the character that was just consumed (confirm against readChar()).
*/
int parseHexDigit()
{
    const auto value = CharacterFunctions::getHexDigitValue (readChar());
    const auto isHexDigit = value >= 0;

    if (! isHexDigit)
        throwError ("Invalid hex character", currentLocation - 1);

    return value;
}
/** Parses four consecutive hex digits into a single UTF-16 code unit.

    The first digit read becomes the most significant nibble. Each digit is
    consumed by parseHexDigit(), which reports an error for any non-hex
    character.

    @returns the 16-bit value formed by the four digits, as a UTF-16 code unit
*/
CharPointer_UTF16::CharType parseCodeUnit()
{
    uint32 codeUnit = 0;

    // Each parseHexDigit() call consumes input, so the calls must happen in a
    // well-defined order. Combining them as operands of a single '|' expression
    // would leave the order in which the digits are read unspecified.
    for (int digitIndex = 0; digitIndex < 4; ++digitIndex)
        codeUnit = (codeUnit << 4) | (uint32) parseHexDigit();

    return (CharPointer_UTF16::CharType) codeUnit;
}
/** Widens a single UTF-16 code unit to a juce_wchar holding the same numeric value.

    The value is first converted to uint16 and then to uint32, so that if
    CharType is a signed 16-bit type, values of 0x8000 and above are not
    sign-extended when widened.
*/
static constexpr juce_wchar asCodePoint (CharPointer_UTF16::CharType codeUnit)
{
    return static_cast<juce_wchar> (static_cast<uint32> (static_cast<uint16> (codeUnit)));
}
/** Parses a "\uXXXX" escape that must encode a UTF-16 low surrogate.

    Used for the second half of a surrogate pair. If the next two characters
    are not a backslash followed by 'u', or if the parsed code unit is not a
    low surrogate, an "Expected UTF-16 low surrogate" error is reported at the
    location where this escape was expected to start.

    @returns the low surrogate code unit
*/
CharPointer_UTF16::CharType parseLowSurrogateCodeUnit()
{
    const auto startLocation = currentLocation;

    const auto fail = [&]
    {
        throwError ("Expected UTF-16 low surrogate", startLocation);
    };

    // The second readChar() only happens when the first one matched the backslash
    const auto hasEscapePrefix = readChar() == '\\' && readChar() == 'u';

    if (! hasEscapePrefix)
        fail();

    const auto codeUnit = parseCodeUnit();

    if (! CharacterFunctions::isLowSurrogate (asCodePoint (codeUnit)))
        fail();

    return codeUnit;
}
/** Parses the hex digits of a "\u" escape and returns the decoded code point.

    A code unit that is not a surrogate is returned as-is. A high surrogate must
    be followed by a second "\uXXXX" escape holding a low surrogate; the pair is
    then decoded as UTF-16 into one code point. Any other first code unit is
    reported as "Invalid UTF-16 escape sequence".

    The error location is currentLocation - 2 on entry, presumably the backslash
    that introduced the escape (confirm against the caller).
*/
juce_wchar parseEscapeSequence()
{
    const auto escapeStart = currentLocation - 2;

    // Second unit stays 0 unless a surrogate pair is being decoded
    std::array<CharPointer_UTF16::CharType, 2> units { parseCodeUnit(), 0 };
    const auto firstAsCodePoint = asCodePoint (units[0]);

    if (! CharacterFunctions::isNonSurrogateCodePoint (firstAsCodePoint))
    {
        if (! CharacterFunctions::isHighSurrogate (firstAsCodePoint))
            throwError ("Invalid UTF-16 escape sequence", escapeStart);

        units[1] = parseLowSurrogateCodeUnit();
    }

    return CharPointer_UTF16 (units.data()).getAndAdvance();
}
String parseString (const juce_wchar quoteChar)
{
MemoryOutputStream buffer (256);
@ -105,7 +168,6 @@ struct JSONParser
if (c == '\\')
{
auto errorLocation = currentLocation;
c = readChar();
switch (c)
@ -113,33 +175,18 @@ struct JSONParser
case '"':
case '\'':
case '\\':
case '/': break;
case '/': break;
case 'a': c = '\a'; break;
case 'b': c = '\b'; break;
case 'f': c = '\f'; break;
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'a': c = '\a'; break;
case 'b': c = '\b'; break;
case 'f': c = '\f'; break;
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'u':
{
c = 0;
case 'u': c = parseEscapeSequence(); break;
for (int i = 4; --i >= 0;)
{
auto digitValue = CharacterFunctions::getHexDigitValue (readChar());
if (digitValue < 0)
throwError ("Syntax error in unicode escape sequence", errorLocation);
c = (juce_wchar) ((c << 4) + static_cast<juce_wchar> (digitValue));
}
break;
}
default: break;
default: break;
}
}
@ -323,15 +370,15 @@ struct JSONFormatter
out << "\\u" << String::toHexString ((int) value).paddedLeft ('0', 4);
}
static void writeString (OutputStream& out, String::CharPointerType t)
static void writeString (OutputStream& out, String::CharPointerType t, JSON::Encoding encoding)
{
for (;;)
{
auto c = t.getAndAdvance();
const auto c = t.getAndAdvance();
switch (c)
{
case 0: return;
case 0: return;
case '\"': out << "\\\""; break;
case '\\': out << "\\\\"; break;
@ -342,27 +389,42 @@ struct JSONFormatter
case '\n': out << "\\n"; break;
default:
if (c >= 32 && c < 127)
if (CharacterFunctions::isAsciiControlCharacter (c))
{
out << (char) c;
writeEscapedChar (out, (unsigned short) c);
}
else
{
if (CharPointer_UTF16::getBytesRequiredFor (c) > 2)
switch (encoding)
{
CharPointer_UTF16::CharType chars[2];
CharPointer_UTF16 utf16 (chars);
utf16.write (c);
case JSON::Encoding::utf8:
out << String::charToString (c);
break;
for (int i = 0; i < 2; ++i)
writeEscapedChar (out, (unsigned short) chars[i]);
}
else
{
writeEscapedChar (out, (unsigned short) c);
case JSON::Encoding::ascii:
if (CharacterFunctions::isAscii (c))
{
out << String::charToString (c);
}
else if (CharacterFunctions::isPartOfBasicMultilingualPlane (c))
{
if (CharacterFunctions::isNonSurrogateCodePoint (c))
writeEscapedChar (out, (unsigned short) c);
else
jassertfalse; // Illegal unicode character
}
else
{
CharPointer_UTF16::CharType codeUnits[2] = {};
CharPointer_UTF16 utf16 (codeUnits);
utf16.write (c);
for (auto& codeUnit : codeUnits)
writeEscapedChar (out, (unsigned short) codeUnit);
}
break;
}
}
break;
}
}
@ -420,7 +482,7 @@ void JSON::writeToStream (OutputStream& out, const var& v, const FormatOptions&
if (v.isString())
{
out << '"';
JSONFormatter::writeString (out, v.toString().getCharPointer());
JSONFormatter::writeString (out, v.toString().getCharPointer(), opt.getEncoding());
out << '"';
}
else if (v.isVoid())
@ -536,7 +598,7 @@ void JSON::writeToStream (OutputStream& output, const var& data, const bool allO
String JSON::escapeString (StringRef s)
{
MemoryOutputStream mo;
JSONFormatter::writeString (mo, s.text);
JSONFormatter::writeString (mo, s.text, Encoding::ascii);
return mo.toString();
}
@ -650,11 +712,126 @@ public:
}
}
/** Checks that a single character round-trips through JSON with the given encoding.

    Expects that writing the character as a JSON string produces expectedOutput
    wrapped in double quotes, and that parsing that quoted text gives back the
    original character.
*/
void expectCharacterEncoding (juce_wchar character, const String& expectedOutput, JSON::Encoding encoding)
{
    const auto original = String::charToString (character);
    const auto options = JSON::FormatOptions{}.withEncoding (encoding);
    const auto expectedJson = '"' + expectedOutput + '"';

    expectEquals (JSON::toString (original, options), expectedJson);
    expectEquals (JSON::fromString (expectedJson).toString(), original);
}
/** Expects the character to be written literally, without any escape sequence,
    for both the ASCII and the UTF-8 encodings.
*/
void expectNoEscapeSequence (juce_wchar input)
{
    const auto literal = String::charToString (input);

    for (const auto encoding : { JSON::Encoding::ascii, JSON::Encoding::utf8 })
        expectCharacterEncoding (input, literal, encoding);
}
/** Expects the character to be written as the given escape sequence for both
    the ASCII and the UTF-8 encodings.
*/
void expectEscapeSequenceForAllEncodings (juce_wchar input, const String& escapeSequence)
{
    for (const auto encoding : { JSON::Encoding::ascii, JSON::Encoding::utf8 })
        expectCharacterEncoding (input, escapeSequence, encoding);
}
/** Expects the character to be written as the given escape sequence when using
    the ASCII encoding, but written literally when using the UTF-8 encoding.
*/
void expectEscapeSequenceForAsciiEncodingOnly (juce_wchar input, const String& escapeSequence)
{
    const auto literal = String::charToString (input);

    expectCharacterEncoding (input, escapeSequence, JSON::Encoding::ascii);
    expectCharacterEncoding (input, literal, JSON::Encoding::utf8);
}
void runTest() override
{
beginTest ("Float formatting");
{
beginTest ("JSON");
std::map<double, String> tests;
tests[1] = "1.0";
tests[1.1] = "1.1";
tests[1.01] = "1.01";
tests[0.76378] = "0.76378";
tests[-10] = "-10.0";
tests[10.01] = "10.01";
tests[0.0123] = "0.0123";
tests[-3.7e-27] = "-3.7e-27";
tests[1e+40] = "1.0e40";
tests[-12345678901234567.0] = "-1.234567890123457e16";
tests[192000] = "192000.0";
tests[1234567] = "1.234567e6";
tests[0.00006] = "0.00006";
tests[0.000006] = "6.0e-6";
for (auto& test : tests)
expectEquals (JSON::toString (test.first), test.second);
}
beginTest ("ASCII control characters are always escaped");
{
expectEscapeSequenceForAllEncodings ('\x01', "\\u0001");
expectEscapeSequenceForAllEncodings ('\x02', "\\u0002");
expectEscapeSequenceForAllEncodings ('\x03', "\\u0003");
expectEscapeSequenceForAllEncodings ('\x04', "\\u0004");
expectEscapeSequenceForAllEncodings ('\x05', "\\u0005");
expectEscapeSequenceForAllEncodings ('\x06', "\\u0006");
expectEscapeSequenceForAllEncodings ('\x07', "\\u0007");
expectEscapeSequenceForAllEncodings ('\x08', "\\b");
expectEscapeSequenceForAllEncodings ('\x09', "\\t");
expectEscapeSequenceForAllEncodings ('\x0a', "\\n");
expectEscapeSequenceForAllEncodings ('\x0b', "\\u000b");
expectEscapeSequenceForAllEncodings ('\x0c', "\\f");
expectEscapeSequenceForAllEncodings ('\x0d', "\\r");
expectEscapeSequenceForAllEncodings ('\x0e', "\\u000e");
expectEscapeSequenceForAllEncodings ('\x0f', "\\u000f");
expectEscapeSequenceForAllEncodings ('\x10', "\\u0010");
expectEscapeSequenceForAllEncodings ('\x11', "\\u0011");
expectEscapeSequenceForAllEncodings ('\x12', "\\u0012");
expectEscapeSequenceForAllEncodings ('\x13', "\\u0013");
expectEscapeSequenceForAllEncodings ('\x14', "\\u0014");
expectEscapeSequenceForAllEncodings ('\x15', "\\u0015");
expectEscapeSequenceForAllEncodings ('\x16', "\\u0016");
expectEscapeSequenceForAllEncodings ('\x17', "\\u0017");
expectEscapeSequenceForAllEncodings ('\x18', "\\u0018");
expectEscapeSequenceForAllEncodings ('\x19', "\\u0019");
expectEscapeSequenceForAllEncodings ('\x1a', "\\u001a");
expectEscapeSequenceForAllEncodings ('\x1b', "\\u001b");
expectEscapeSequenceForAllEncodings ('\x1c', "\\u001c");
expectEscapeSequenceForAllEncodings ('\x1d', "\\u001d");
expectEscapeSequenceForAllEncodings ('\x1e', "\\u001e");
expectEscapeSequenceForAllEncodings ('\x1f', "\\u001f");
}
beginTest ("Only special ASCII characters are escaped");
{
    // Walk every ASCII character from space (32) upwards. The double quote and
    // the backslash must be written as two-character escapes; every other
    // character in this range is expected to be written literally.
    //
    // The comparisons must be equalities: with '!=' the loop would only ever
    // re-test the quote and backslash and never reach expectNoEscapeSequence().
    //
    // NOTE(review): this assumes the writer does not escape DEL (0x7f) or '/';
    // confirm against JSONFormatter::writeString().
    for (juce_wchar c = 32; CharacterFunctions::isAscii (c); ++c)
    {
        if (c == '"')
            expectEscapeSequenceForAllEncodings (c, R"(\")");
        else if (c == '\\')
            expectEscapeSequenceForAllEncodings (c, R"(\\)");
        else
            expectNoEscapeSequence (c);
    }
}
beginTest ("Unicode characters are escaped for ASCII encoding only");
{
// First and last 2 byte UTF-8 code points
expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x0080, "\\u0080");
expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x07FF, "\\u07ff");
// First and last 3 byte UTF-8 code points
expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x0800, "\\u0800");
expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xffff, "\\uffff");
// Code points at the UTF-16 surrogate boundaries
expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xd7ff, "\\ud7ff");
expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0xe000, "\\ue000");
// First and last 4 byte UTF-8 code points (also first and last UTF-16 surrogate pairs)
expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x010000, "\\ud800\\udc00");
expectEscapeSequenceForAsciiEncodingOnly ((juce_wchar) 0x10ffff, "\\udbff\\udfff");
}
beginTest ("Fuzz tests");
{
auto r = getRandom();
expect (JSON::parse (String()) == var());
@ -681,29 +858,6 @@ public:
expect (asString.isNotEmpty() && parsedString == asString);
}
}
{
beginTest ("Float formatting");
std::map<double, String> tests;
tests[1] = "1.0";
tests[1.1] = "1.1";
tests[1.01] = "1.01";
tests[0.76378] = "0.76378";
tests[-10] = "-10.0";
tests[10.01] = "10.01";
tests[0.0123] = "0.0123";
tests[-3.7e-27] = "-3.7e-27";
tests[1e+40] = "1.0e40";
tests[-12345678901234567.0] = "-1.234567890123457e16";
tests[192000] = "192000.0";
tests[1234567] = "1.234567e6";
tests[0.00006] = "0.00006";
tests[0.000006] = "6.0e-6";
for (auto& test : tests)
expectEquals (JSON::toString (test.first), test.second);
}
}
};

View file

@ -107,6 +107,12 @@ public:
multiLine, ///< Newlines and spaces will be included in the output, in order to make it easy to read for humans
};
/** Selects how non-ASCII characters are written when formatting JSON text. */
enum class Encoding
{
utf8, ///< Write non-ASCII characters directly as UTF-8, without escape sequences. This is the default behaviour.
ascii, ///< Write ASCII characters only; other Unicode characters are written as UTF-16 escape sequences.
};
/**
Allows formatting var objects as JSON with various configurable options.
*/
@ -114,17 +120,34 @@ public:
{
public:
/** Returns a copy of this Formatter with the specified spacing. */
FormatOptions withSpacing (Spacing x) const { return withMember (*this, &FormatOptions::spacing, x); }
FormatOptions withSpacing (Spacing x) const
{
return withMember (*this, &FormatOptions::spacing, x);
}
/** Returns a copy of this Formatter with the specified maximum number of decimal places.
This option determines the precision of floating point numbers in scientific notation.
*/
FormatOptions withMaxDecimalPlaces (int x) const { return withMember (*this, &FormatOptions::maxDecimalPlaces, x); }
FormatOptions withMaxDecimalPlaces (int x) const
{
return withMember (*this, &FormatOptions::maxDecimalPlaces, x);
}
/** Returns a copy of this Formatter with the specified indent level.
This should only be necessary when serialising multiline nested types.
*/
FormatOptions withIndentLevel (int x) const { return withMember (*this, &FormatOptions::indent, x); }
FormatOptions withIndentLevel (int x) const
{
return withMember (*this, &FormatOptions::indent, x);
}
/** Returns a copy of this Formatter with the specified encoding.
Use this to force a JSON to be ASCII characters only.
*/
FormatOptions withEncoding (Encoding x) const
{
return withMember (*this, &FormatOptions::encoding, x);
}
/** Returns the spacing used by this Formatter. */
Spacing getSpacing() const { return spacing; }
@ -135,8 +158,12 @@ public:
/** Returns the indent level of this Formatter. */
int getIndentLevel() const { return indent; }
/** Returns the encoding of this Formatter. */
Encoding getEncoding() const { return encoding; }
private:
Spacing spacing = Spacing::multiLine;
Encoding encoding = Encoding::utf8;
int maxDecimalPlaces = 15;
int indent = 0;
};