// // ██████ ██  ██  ██████  ██████ // ██      ██  ██ ██    ██ ██       ** Classy Header-Only Classes ** // ██  ███████ ██  ██ ██ // ██  ██   ██ ██  ██ ██ https://github.com/Tracktion/choc //  ██████ ██  ██  ██████   ██████ // // CHOC is (C)2022 Tracktion Corporation, and is offered under the terms of the ISC license: // // Permission to use, copy, modify, and/or distribute this software for any purpose with or // without fee is hereby granted, provided that the above copyright notice and this permission // notice appear in all copies. THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL // WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY // AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR // CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, // WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #ifndef CHOC_UTF8_HEADER_INCLUDED #define CHOC_UTF8_HEADER_INCLUDED #include #include "choc_StringUtilities.h" namespace choc::text { /// An integer type to represent a unicode code-point. using UnicodeChar = uint32_t; //============================================================================== /** A non-owning pointer which can iterate over a chunk of null-terminated UTF-8 text and read it as wide unicode characters. */ struct UTF8Pointer { explicit constexpr UTF8Pointer (const char* utf8Text) noexcept : text (utf8Text) {} UTF8Pointer() = default; UTF8Pointer (const UTF8Pointer&) = default; UTF8Pointer& operator= (const UTF8Pointer&) = default; /// Returns the raw data that this points to. const char* data() const noexcept { return text; } /// Returns true if the pointer is not null. operator bool() const noexcept { return text != nullptr; } /// Returns true if the pointer is either null or points to a null terminator char. bool empty() const { return text == nullptr || *text == 0; } /// Returns the length by iterating all unicode chars and counting them. /// Note that this is slow, and is not a count of the number of bytes in the string! size_t length() const; //============================================================================== /// Returns the first unicode character in the string. UnicodeChar operator*() const; /// Skips past the first unicode character. /// Moving beyond the end of the string is undefined behaviour and will trigger an assertion. UTF8Pointer& operator++(); /// Skips past the first unicode character. /// Moving beyond the end of the string is undefined behaviour and will trigger an assertion. UTF8Pointer operator++ (int); /// Moves backwards to the previous unicode character. /// Moving beyond the end of the string is undefined behaviour. UTF8Pointer operator--(); /// Skips past the given number of unicode characters. /// Moving beyond the end of the string is undefined behaviour and will trigger an assertion. UTF8Pointer& operator+= (size_t numCharsToSkip); /// Returns a pointer which points to the n-th unicode character in the text /// Reading beyond the end of the string is undefined behaviour and may trigger an assertion. UTF8Pointer operator+ (size_t numCharsToSkip) const; /// Returns a pointer which points to the n-th unicode character in the text. /// Reading beyond the end of the string is undefined behaviour and may trigger an assertion. UTF8Pointer operator+ (int numCharsToSkip) const; /// Skips past the first unicode character and returns it as a code-point. /// Calling this when the current character is the terminator will leave the pointer in an /// invalid state. UnicodeChar popFirstChar(); /// Finds the next occurrence of the given string, or return a nullptr if not found. UTF8Pointer find (const char* textToFind) const; /// Returns true if the text starts with this string bool startsWith (const char* textToMatch) const; /// If the first character matches the given one, this will advance the pointer and return true. bool skipIfStartsWith (char charToMatch); /// If the start of the text matches the given string, this will advance this pointer to skip /// past it, and return true. If not, it will return false without modifying this pointer. bool skipIfStartsWith (const char* textToMatch); /// Returns a pointer to the first non-whitespace character in the given string (which may /// be the terminating null character if it's all whitespace). [[nodiscard]] UTF8Pointer findEndOfWhitespace() const; /// Iterates backwards from this position to find the first character that follows /// a new-line. The pointer provided marks the furthest back that the function should search [[nodiscard]] UTF8Pointer findStartOfLine (UTF8Pointer startOfValidText) const; /// Searches forwards for the next character that is followed by a new-line or a null-terminator. [[nodiscard]] UTF8Pointer findEndOfLine() const; //============================================================================== struct EndIterator {}; struct Iterator { explicit constexpr Iterator (const char* t) : text (t) {} Iterator (const Iterator&) = default; Iterator& operator= (const Iterator&) = default; UnicodeChar operator*() const { return *UTF8Pointer (text); } Iterator& operator++() { UTF8Pointer p (text); ++p; text = p.text; return *this; } Iterator operator++ (int) { auto old = *this; ++*this; return old; } bool operator== (EndIterator) const { return *text == 0; } bool operator!= (EndIterator) const { return *text != 0; } private: const char* text; }; Iterator begin() const; EndIterator end() const; //============================================================================== /// This does a pointer comparison, NOT a comparison of the text itself! bool operator== (UTF8Pointer other) const noexcept { return text == other.text; } /// This does a pointer comparison, NOT a comparison of the text itself! bool operator!= (UTF8Pointer other) const noexcept { return text != other.text; } /// This does a pointer comparison, NOT a comparison of the text itself! bool operator< (UTF8Pointer other) const noexcept { return text < other.text; } /// This does a pointer comparison, NOT a comparison of the text itself! bool operator> (UTF8Pointer other) const noexcept { return text > other.text; } /// This does a pointer comparison, NOT a comparison of the text itself! bool operator<= (UTF8Pointer other) const noexcept { return text <= other.text; } /// This does a pointer comparison, NOT a comparison of the text itself! bool operator>= (UTF8Pointer other) const noexcept { return text >= other.text; } bool operator== (decltype(nullptr)) const noexcept { return text == nullptr; } bool operator!= (decltype(nullptr)) const noexcept { return text != nullptr; } private: const char* text = nullptr; }; //============================================================================== /// Checks a given chunk of data to see whether it's valid UTF-8. /// If no errors are found, this returns nullptr. If an error is found, it returns the address /// of the offending byte. Note that zero bytes in the data are considered to be valid UTF-8. const char* findInvalidUTF8Data (const void* dataToCheck, size_t numBytesToRead); /// Writes the bytes for a unicode character, and returns the number of bytes that were needed. /// The buffer passed in needs to have at least 4 bytes capacity. uint32_t convertUnicodeCodepointToUTF8 (char* dest, UnicodeChar codepoint); /// Appends a unicode codepoint to a std::string as a sequence of UTF-8 bytes. void appendUTF8 (std::string& target, UnicodeChar codepoint); /// Checks whether a given codepoint is a high-surrogate bool isUnicodeHighSurrogate (UnicodeChar codepoint); /// Checks whether a given codepoint is a low-surrogate bool isUnicodeLowSurrogate (UnicodeChar codepoint); struct SurrogatePair { UnicodeChar high = 0, low = 0; }; /// For a codepoint >= 0x10000, this will return a surrogate pair to represent it. SurrogatePair splitCodePointIntoSurrogatePair (UnicodeChar fullCodePoint); /// Combines a high and low surrogate into a single codepoint. UnicodeChar createUnicodeFromHighAndLowSurrogates (SurrogatePair); /// Checks a UTF-8/CESU-8 string to see if it contains any surrogate pairs. /// If it does, then to use it as UTF-8 you'll probably need to run it through /// convertSurrogatePairsToUTF8(). bool containsSurrogatePairs (UTF8Pointer); /// Returns a string where any surrogate pairs have been converted to UTF-8 codepoints. std::string convertSurrogatePairsToUTF8 (UTF8Pointer); /// Returns true if the given UTF-8 string can be used as CESU-8 without conversion. If not, /// you'll need to run it through convertUTF8ToCESU8() to convert the 32-bit code-points /// to surrogate pairs. bool isValidCESU8 (std::string_view utf8); /// Converts any 32-bit characters in this UTF-8 string to surrogate pairs, which makes /// the resulting string suitable for use at CESU-8. [[nodiscard]] std::string convertUTF8ToCESU8 (UTF8Pointer); //============================================================================== /// Represents a line and column index within a block of text. struct LineAndColumn { /// Valid line and column values start at 1. /// If either is 0, it means that the LineAndColumn object is uninitialised. size_t line = 0, column = 0; /// Returns true if neither the line nor column is zero. bool isValid() const noexcept { return line != 0 && column != 0; } /// Turns this location into a [line]:[col] string suitable for use in a /// standard compiler error message format. std::string toString() const; }; /// Given a block of text and a position within it, this will work out the /// line and column of that position. LineAndColumn findLineAndColumn (UTF8Pointer fullText, UTF8Pointer targetPosition); //============================================================================== // _ _ _ _ // __| | ___ | |_ __ _ (_)| | ___ // / _` | / _ \| __| / _` || || |/ __| // | (_| || __/| |_ | (_| || || |\__ \ _ _ _ // \__,_| \___| \__| \__,_||_||_||___/(_)(_)(_) // // Code beyond this point is implementation detail... // //============================================================================== inline size_t UTF8Pointer::length() const { size_t count = 0; if (text != nullptr) for (auto p = *this; *p.text != 0; ++p) ++count; return count; } inline const char* findInvalidUTF8Data (const void* dataToCheck, size_t numBytes) { CHOC_ASSERT (dataToCheck != nullptr); auto source = static_cast (dataToCheck); const auto end = source + numBytes; for (;;) { if (source >= end) return nullptr; auto byte = static_cast (*source); if (byte >= 0) { ++source; continue; } int testBit = 0x40, numExtraBytes = 0; while ((byte & testBit) != 0) { testBit >>= 1; ++numExtraBytes; if (numExtraBytes > 3 || source + static_cast (numExtraBytes) >= end || (numExtraBytes == 3 && *UTF8Pointer (source) > 0x10ffff)) { return source; } } if (numExtraBytes == 0) return source; ++source; for (int i = 0; i < numExtraBytes; ++i) { if ((*source & 0xc0) != 0x80) return source; ++source; } } } inline UnicodeChar UTF8Pointer::operator*() const { return UTF8Pointer (*this).popFirstChar(); } inline UTF8Pointer& UTF8Pointer::operator++() { CHOC_ASSERT (! empty()); // can't advance past the zero-terminator auto firstByte = static_cast (*text++); if (firstByte >= 0) return *this; uint32_t testBit = 0x40, unicodeChar = static_cast (firstByte); while ((unicodeChar & testBit) != 0 && testBit > 8) { ++text; testBit >>= 1; } return *this; } inline UTF8Pointer UTF8Pointer::operator++ (int) { auto prev = *this; operator++(); return prev; } inline UTF8Pointer UTF8Pointer::operator--() { CHOC_ASSERT (text != nullptr); // mustn't use this on nullptrs uint32_t bytesSkipped = 0; while ((*--text & 0xc0) == 0x80) { if (bytesSkipped > 2) { CHOC_ASSERT (bytesSkipped <= 2); break; } ++bytesSkipped; } return *this; } inline UTF8Pointer& UTF8Pointer::operator+= (size_t numCharsToSkip) { while (numCharsToSkip != 0) { --numCharsToSkip; operator++(); } return *this; } inline UTF8Pointer UTF8Pointer::operator+ (size_t numCharsToSkip) const { auto p = *this; p += numCharsToSkip; return p; } inline UTF8Pointer UTF8Pointer::operator+ (int numCharsToSkip) const { CHOC_ASSERT (numCharsToSkip >= 0); return operator+ (static_cast (numCharsToSkip)); } inline UnicodeChar UTF8Pointer::popFirstChar() { CHOC_ASSERT (text != nullptr); // mustn't use this on nullptrs auto firstByte = static_cast (*text++); UnicodeChar unicodeChar = static_cast (firstByte); if (firstByte < 0) { uint32_t bitMask = 0x7f, numExtraBytes = 0; for (uint32_t testBit = 0x40; (unicodeChar & testBit) != 0 && testBit > 8; ++numExtraBytes) { bitMask >>= 1; testBit >>= 1; } unicodeChar &= bitMask; for (uint32_t i = 0; i < numExtraBytes; ++i) { uint32_t nextByte = static_cast (*text); CHOC_ASSERT ((nextByte & 0xc0) == 0x80); // error in the data - you should always make sure the source // gets validated before iterating a UTF8Pointer over it unicodeChar = (unicodeChar << 6) | (nextByte & 0x3f); ++text; } } return unicodeChar; } inline bool UTF8Pointer::startsWith (const char* textToMatch) const { CHOC_ASSERT (textToMatch != nullptr); if (auto p = text) { while (*textToMatch != 0) if (*textToMatch++ != *p++) return false; return true; } return false; } inline UTF8Pointer UTF8Pointer::find (const char* textToFind) const { CHOC_ASSERT (textToFind != nullptr); for (auto t = *this;; ++t) if (t.startsWith (textToFind) || t.empty()) return t; } inline bool UTF8Pointer::skipIfStartsWith (char charToMatch) { if (text != nullptr && *text == charToMatch && charToMatch != 0) { ++text; return true; } return false; } inline bool UTF8Pointer::skipIfStartsWith (const char* textToMatch) { CHOC_ASSERT (textToMatch != nullptr); if (auto p = text) { while (*textToMatch != 0) if (*textToMatch++ != *p++) return false; text = p; return true; } return false; } inline UTF8Pointer UTF8Pointer::findEndOfWhitespace() const { auto p = *this; if (p.text != nullptr) while (choc::text::isWhitespace (*p.text)) ++p; return p; } inline UTF8Pointer UTF8Pointer::findStartOfLine (UTF8Pointer start) const { if (text == nullptr) return {}; auto l = *this; CHOC_ASSERT (l.text >= start.text && start.text != nullptr); while (l.text > start.text) { auto prev = l; auto c = *--prev; if (c == '\r' || c == '\n') break; l = prev; } return l; } inline UTF8Pointer UTF8Pointer::findEndOfLine() const { if (text == nullptr) return {}; auto l = *this; while (! l.empty()) { auto c = l.popFirstChar(); if (c == '\r' || c == '\n') break; } return l; } inline UTF8Pointer::Iterator UTF8Pointer::begin() const { CHOC_ASSERT (text != nullptr); return Iterator (text); } inline UTF8Pointer::EndIterator UTF8Pointer::end() const { return EndIterator(); } inline LineAndColumn findLineAndColumn (UTF8Pointer start, UTF8Pointer targetPosition) { if (start == nullptr || targetPosition == nullptr) return {}; CHOC_ASSERT (start <= targetPosition); LineAndColumn lc { 1, 1 }; while (start < targetPosition && ! start.empty()) { ++lc.column; if (*start++ == '\n') { lc.line++; lc.column = 1; } } return lc; } inline std::string LineAndColumn::toString() const { return std::to_string (line) + ':' + std::to_string (column); } //============================================================================== inline uint32_t convertUnicodeCodepointToUTF8 (char* dest, UnicodeChar unicodeChar) { if (unicodeChar < 0x80) { *dest = static_cast (unicodeChar); return 1; } uint32_t extraBytes = 1; if (unicodeChar >= 0x800) { ++extraBytes; if (unicodeChar >= 0x10000) ++extraBytes; } dest[0] = static_cast ((0xffu << (7 - extraBytes)) | (unicodeChar >> (extraBytes * 6))); for (uint32_t i = 1; i <= extraBytes; ++i) dest[i] = static_cast (0x80u | (0x3fu & (unicodeChar >> ((extraBytes - i) * 6)))); return extraBytes + 1; } inline void appendUTF8 (std::string& target, UnicodeChar unicodeChar) { char bytes[4]; auto num = convertUnicodeCodepointToUTF8 (bytes, unicodeChar); target.append (bytes, num); } inline bool isUnicodeHighSurrogate (UnicodeChar codepoint) { return codepoint >= 0xd800 && codepoint <= 0xdbff; } inline bool isUnicodeLowSurrogate (UnicodeChar codepoint) { return codepoint >= 0xdc00 && codepoint <= 0xdfff; } inline UnicodeChar createUnicodeFromHighAndLowSurrogates (SurrogatePair pair) { if (! isUnicodeHighSurrogate (pair.high)) return pair.high; if (! isUnicodeLowSurrogate (pair.low)) return 0; return (pair.high << 10) + pair.low - 0x35fdc00u; } inline bool containsSurrogatePairs (UTF8Pointer text) { for (;;) { auto c = text.popFirstChar(); if (c == 0) return false; if (isUnicodeHighSurrogate (c)) return true; } } inline std::string convertSurrogatePairsToUTF8 (UTF8Pointer text) { std::string result; for (;;) { auto c = text.popFirstChar(); if (choc::text::isUnicodeHighSurrogate (c)) c = createUnicodeFromHighAndLowSurrogates ({ c, text.popFirstChar() }); if (c == 0) return result; appendUTF8 (result, c); } } inline SurrogatePair splitCodePointIntoSurrogatePair (UnicodeChar fullCodePoint) { CHOC_ASSERT (fullCodePoint >= 0x10000); return { static_cast (0xd800u + ((fullCodePoint - 0x10000u) >> 10)), static_cast (0xdc00u + (fullCodePoint & 0x3ffu)) }; } inline bool isValidCESU8 (std::string_view utf8) { for (auto c : utf8) if (static_cast (c) >= 0xe8) return false; return true; } inline std::string convertUTF8ToCESU8 (UTF8Pointer utf8) { std::string result; for (;;) { auto c = utf8.popFirstChar(); if (c == 0) return result; if (c < 128) { result += (char) c; } else if (c >= 0x10000) { auto pair = splitCodePointIntoSurrogatePair (c); appendUTF8 (result, pair.high); appendUTF8 (result, pair.low); } else { appendUTF8 (result, c); } } } } // namespace choc::text #endif