/* Copyright (C) 2002 The gtkmm Development Team
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
 */

// NOTE(review): the header names of the #include directives below are missing
// in this copy of the file (the angle-bracket text appears to have been
// stripped). Restore them from the upstream source before building.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include // For std::move()
#ifdef HAVE_CONFIG_H
#include
#endif

// If glibmm is built with Autotools, GLIBMM_SIZEOF_WCHAR_T is not defined and
// SIZEOF_WCHAR_T is defined in config.h.
// If glibmm is built with Meson, config.h does not exist and
// GLIBMM_SIZEOF_WCHAR_T is defined in glibmmconfig.h.
#if !defined(SIZEOF_WCHAR_T) && defined(GLIBMM_SIZEOF_WCHAR_T)
#define SIZEOF_WCHAR_T GLIBMM_SIZEOF_WCHAR_T
#endif

namespace
{

using Glib::ustring;
using Glib::UStringView;

// Little helper to make the conversion from gunichar to UTF-8 a one-liner.
//
// The constructor passes buf to g_unichar_to_utf8() and stores that function's
// return value in len. Callers in this file always use the pair (buf, len)
// together and never treat buf as a NUL-terminated string.
//
struct UnicharToUtf8
{
  // Output buffer handed to g_unichar_to_utf8().
  char buf[6];
  // Return value of g_unichar_to_utf8(): the length of the encoding in buf.
  ustring::size_type len;

  explicit UnicharToUtf8(gunichar uc) : len(g_unichar_to_utf8(uc, buf)) {}
};

// All utf8_*_offset() functions return npos if offset is out of range.
// The caller should decide if npos is a valid argument and just marks
// the whole string, or if it is not allowed (e.g. for start positions).
// In the latter case std::out_of_range should be thrown, but usually
// std::string will do that for us.

// First overload: stop on '\0' character.
static ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset) { if (offset == ustring::npos) return ustring::npos; const char* const utf8_skip = g_utf8_skip; const char* p = str; for (; offset != 0; --offset) { const unsigned int c = static_cast(*p); if (c == 0) return ustring::npos; p += utf8_skip[c]; } return (p - str); } // Second overload: stop when reaching maxlen. static ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset, ustring::size_type maxlen) { if (offset == ustring::npos) return ustring::npos; const char* const utf8_skip = g_utf8_skip; const char* const pend = str + maxlen; const char* p = str; for (; offset != 0; --offset) { if (p >= pend) return ustring::npos; p += utf8_skip[static_cast(*p)]; } return (p - str); } // Third overload: stop when reaching str.size(). // inline ustring::size_type utf8_byte_offset(const std::string& str, ustring::size_type offset) { return utf8_byte_offset(str.data(), offset, str.size()); } // Takes UTF-8 character offset and count in ci and cn. // Returns the byte offset and count in i and n. // struct Utf8SubstrBounds { ustring::size_type i; ustring::size_type n; Utf8SubstrBounds(const std::string& str, ustring::size_type ci, ustring::size_type cn) : i(utf8_byte_offset(str, ci)), n(ustring::npos) { if (i != ustring::npos) n = utf8_byte_offset(str.data() + i, cn, str.size() - i); } }; // Converts byte offset to UTF-8 character offset. inline ustring::size_type utf8_char_offset(const std::string& str, ustring::size_type offset) { if (offset == ustring::npos) return ustring::npos; const char* const pdata = str.data(); return g_utf8_pointer_to_offset(pdata, pdata + offset); } // Helper to implement ustring::find_first_of() and find_first_not_of(). // Returns the UTF-8 character offset, or ustring::npos if not found. 
static ustring::size_type utf8_find_first_of(const std::string& str, ustring::size_type offset, const char* utf8_match, long utf8_match_size, bool find_not_of) { const ustring::size_type byte_offset = utf8_byte_offset(str, offset); if (byte_offset == ustring::npos) return ustring::npos; long ucs4_match_size = 0; const auto ucs4_match = Glib::make_unique_ptr_gfree(g_utf8_to_ucs4_fast(utf8_match, utf8_match_size, &ucs4_match_size)); const gunichar* const match_begin = ucs4_match.get(); const gunichar* const match_end = match_begin + ucs4_match_size; const char* const str_begin = str.data(); const char* const str_end = str_begin + str.size(); for (const char* pstr = str_begin + byte_offset; pstr < str_end; pstr = g_utf8_next_char(pstr)) { const gunichar* const pfound = std::find(match_begin, match_end, g_utf8_get_char(pstr)); if ((pfound != match_end) != find_not_of) return offset; ++offset; } return ustring::npos; } // Helper to implement ustring::find_last_of() and find_last_not_of(). // Returns the UTF-8 character offset, or ustring::npos if not found. static ustring::size_type utf8_find_last_of(const std::string& str, ustring::size_type offset, const char* utf8_match, long utf8_match_size, bool find_not_of) { long ucs4_match_size = 0; const auto ucs4_match = Glib::make_unique_ptr_gfree(g_utf8_to_ucs4_fast(utf8_match, utf8_match_size, &ucs4_match_size)); const gunichar* const match_begin = ucs4_match.get(); const gunichar* const match_end = match_begin + ucs4_match_size; const char* const str_begin = str.data(); const char* pstr = str_begin; // Set pstr one byte beyond the actual start position. const ustring::size_type byte_offset = utf8_byte_offset(str, offset); pstr += (byte_offset < str.size()) ? byte_offset + 1 : str.size(); while (pstr > str_begin) { // Move to previous character. 
do --pstr; while ((static_cast(*pstr) & 0xC0u) == 0x80); const gunichar* const pfound = std::find(match_begin, match_end, g_utf8_get_char(pstr)); if ((pfound != match_end) != find_not_of) return g_utf8_pointer_to_offset(str_begin, pstr); } return ustring::npos; } } // anonymous namespace namespace Glib { #ifndef GLIBMM_HAVE_ALLOWS_STATIC_INLINE_NPOS // Initialize static member here, // because the compiler did not allow us do it inline. const ustring::size_type ustring::npos = std::string::npos; #endif /* * We need our own version of g_utf8_get_char(), because the std::string * iterator is not necessarily a plain pointer (it's in fact not in GCC's * libstdc++-v3). Copying the UTF-8 data into a temporary buffer isn't an * option since this operation is quite time critical. The implementation * is quite different from g_utf8_get_char() -- both more generic and likely * faster. * * By looking at the first byte of a UTF-8 character one can determine the * number of bytes used. GLib offers the g_utf8_skip[] array for this purpose, * but accessing this global variable would, on IA32 at least, introduce * a function call to fetch the Global Offset Table, plus two levels of * indirection in order to read the value. Even worse, fetching the GOT is * always done right at the start of the function instead of the branch that * actually uses the variable. * * Fortunately, there's a better way to get the byte count. As this table * shows, there's a nice regular pattern in the UTF-8 encoding scheme: * * 0x00000000 - 0x0000007F: 0xxxxxxx * 0x00000080 - 0x000007FF: 110xxxxx 10xxxxxx * 0x00000800 - 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx * 0x00010000 - 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x00200000 - 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x04000000 - 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * * Except for the single byte case, the number of leading 1-bits equals the * byte count. 
All that is needed is to shift the first byte to the left * until bit 7 becomes 0. Naturally, doing so requires a loop -- but since * we already have one, no additional cost is introduced. This shifting can * further be combined with the computation of the bitmask needed to eliminate * the leading length bits, thus saving yet another register. * * Note: If you change this code, it is advisable to also review what the * compiler makes of it in the assembler output. Except for some pointless * register moves, the generated code is sufficiently close to the optimum * with GCC 4.1.2 on x86_64. */ gunichar get_unichar_from_std_iterator(std::string::const_iterator pos) { unsigned int result = static_cast(*pos); if ((result & 0x80) != 0) { unsigned int mask = 0x40; do { result <<= 6; const unsigned int c = static_cast(*++pos); mask <<= 5; result += c - 0x80; } while ((result & mask) != 0); result &= mask - 1; } return result; } /**** Glib::ustring ********************************************************/ ustring::ustring() : string_() { } ustring::ustring(const ustring& other) : string_(other.string_) { } ustring::ustring(ustring&& other) : string_(std::move(other.string_)) { } ustring::ustring(const ustring& src, ustring::size_type i, ustring::size_type n) : string_() { const Utf8SubstrBounds bounds(src.string_, i, n); string_.assign(src.string_, bounds.i, bounds.n); } ustring::ustring(const char* src, ustring::size_type n) : string_(src, utf8_byte_offset(src, n)) { } ustring::ustring(const char* src) : string_(src) { } ustring::ustring(ustring::size_type n, gunichar uc) : string_() { if (uc < 0x80) { // Optimize the probably most common case. 
string_.assign(n, static_cast(uc)); } else { const UnicharToUtf8 conv(uc); string_.reserve(n * conv.len); for (; n > 0; --n) string_.append(conv.buf, conv.len); } } ustring::ustring(ustring::size_type n, char c) : string_(n, c) { } ustring::ustring(const std::string& src) : string_(src) { } ustring::ustring(std::string&& src) : string_(std::move(src)) { } ustring::~ustring() noexcept { } void ustring::swap(ustring& other) { string_.swap(other.string_); } /**** Glib::ustring::operator=() *******************************************/ ustring& ustring::operator=(const ustring& other) { string_ = other.string_; return *this; } ustring& ustring::operator=(ustring&& other) { string_ = std::move(other.string_); return *this; } ustring& ustring::operator=(const std::string& src) { string_ = src; return *this; } ustring& ustring::operator=(std::string&& src) { string_ = std::move(src); return *this; } ustring& ustring::operator=(const char* src) { string_ = src; return *this; } ustring& ustring::operator=(gunichar uc) { const UnicharToUtf8 conv(uc); string_.assign(conv.buf, conv.len); return *this; } ustring& ustring::operator=(char c) { string_ = c; return *this; } /**** Glib::ustring::assign() **********************************************/ ustring& ustring::assign(const ustring& src) { string_ = src.string_; return *this; } ustring& ustring::assign(ustring&& src) { string_ = std::move(src.string_); return *this; } ustring& ustring::assign(const ustring& src, ustring::size_type i, ustring::size_type n) { const Utf8SubstrBounds bounds(src.string_, i, n); string_.assign(src.string_, bounds.i, bounds.n); return *this; } ustring& ustring::assign(const char* src, ustring::size_type n) { string_.assign(src, utf8_byte_offset(src, n)); return *this; } ustring& ustring::assign(const char* src) { string_ = src; return *this; } ustring& ustring::assign(ustring::size_type n, gunichar uc) { ustring temp(n, uc); string_.swap(temp.string_); return *this; } ustring& 
ustring::assign(ustring::size_type n, char c) { string_.assign(n, c); return *this; } /**** Glib::ustring::operator+=() ******************************************/ ustring& ustring::operator+=(const ustring& src) { string_ += src.string_; return *this; } ustring& ustring::operator+=(const char* src) { string_ += src; return *this; } ustring& ustring::operator+=(gunichar uc) { const UnicharToUtf8 conv(uc); string_.append(conv.buf, conv.len); return *this; } ustring& ustring::operator+=(char c) { string_ += c; return *this; } /**** Glib::ustring::push_back() *******************************************/ void ustring::push_back(gunichar uc) { const UnicharToUtf8 conv(uc); string_.append(conv.buf, conv.len); } void ustring::push_back(char c) { string_ += c; } /**** Glib::ustring::append() **********************************************/ ustring& ustring::append(const ustring& src) { string_ += src.string_; return *this; } ustring& ustring::append(const ustring& src, ustring::size_type i, ustring::size_type n) { const Utf8SubstrBounds bounds(src.string_, i, n); string_.append(src.string_, bounds.i, bounds.n); return *this; } ustring& ustring::append(const char* src, ustring::size_type n) { string_.append(src, utf8_byte_offset(src, n)); return *this; } ustring& ustring::append(const char* src) { string_ += src; return *this; } ustring& ustring::append(ustring::size_type n, gunichar uc) { string_.append(ustring(n, uc).string_); return *this; } ustring& ustring::append(ustring::size_type n, char c) { string_.append(n, c); return *this; } /**** Glib::ustring::insert() **********************************************/ ustring& ustring::insert(ustring::size_type i, const ustring& src) { string_.insert(utf8_byte_offset(string_, i), src.string_); return *this; } ustring& ustring::insert( ustring::size_type i, const ustring& src, ustring::size_type i2, ustring::size_type n) { const Utf8SubstrBounds bounds2(src.string_, i2, n); string_.insert(utf8_byte_offset(string_, i), 
src.string_, bounds2.i, bounds2.n); return *this; } ustring& ustring::insert(ustring::size_type i, const char* src, ustring::size_type n) { string_.insert(utf8_byte_offset(string_, i), src, utf8_byte_offset(src, n)); return *this; } ustring& ustring::insert(ustring::size_type i, const char* src) { string_.insert(utf8_byte_offset(string_, i), src); return *this; } ustring& ustring::insert(ustring::size_type i, ustring::size_type n, gunichar uc) { string_.insert(utf8_byte_offset(string_, i), ustring(n, uc).string_); return *this; } ustring& ustring::insert(ustring::size_type i, ustring::size_type n, char c) { string_.insert(utf8_byte_offset(string_, i), n, c); return *this; } ustring::iterator ustring::insert(ustring::iterator p, gunichar uc) { const size_type offset = p.base() - string_.begin(); const UnicharToUtf8 conv(uc); string_.insert(offset, conv.buf, conv.len); return iterator(string_.begin() + offset); } ustring::iterator ustring::insert(ustring::iterator p, char c) { return iterator(string_.insert(p.base(), c)); } void ustring::insert(ustring::iterator p, ustring::size_type n, gunichar uc) { string_.insert(p.base() - string_.begin(), ustring(n, uc).string_); } void ustring::insert(ustring::iterator p, ustring::size_type n, char c) { string_.insert(p.base(), n, c); } /**** Glib::ustring::replace() *********************************************/ ustring& ustring::replace(ustring::size_type i, ustring::size_type n, const ustring& src) { const Utf8SubstrBounds bounds(string_, i, n); string_.replace(bounds.i, bounds.n, src.string_); return *this; } ustring& ustring::replace(ustring::size_type i, ustring::size_type n, const ustring& src, ustring::size_type i2, ustring::size_type n2) { const Utf8SubstrBounds bounds(string_, i, n); const Utf8SubstrBounds bounds2(src.string_, i2, n2); string_.replace(bounds.i, bounds.n, src.string_, bounds2.i, bounds2.n); return *this; } ustring& ustring::replace(ustring::size_type i, ustring::size_type n, const char* src, 
ustring::size_type n2) { const Utf8SubstrBounds bounds(string_, i, n); string_.replace(bounds.i, bounds.n, src, utf8_byte_offset(src, n2)); return *this; } ustring& ustring::replace(ustring::size_type i, ustring::size_type n, const char* src) { const Utf8SubstrBounds bounds(string_, i, n); string_.replace(bounds.i, bounds.n, src); return *this; } ustring& ustring::replace(ustring::size_type i, ustring::size_type n, ustring::size_type n2, gunichar uc) { const Utf8SubstrBounds bounds(string_, i, n); string_.replace(bounds.i, bounds.n, ustring(n2, uc).string_); return *this; } ustring& ustring::replace(ustring::size_type i, ustring::size_type n, ustring::size_type n2, char c) { const Utf8SubstrBounds bounds(string_, i, n); string_.replace(bounds.i, bounds.n, n2, c); return *this; } ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, const ustring& src) { string_.replace(pbegin.base(), pend.base(), src.string_); return *this; } ustring& ustring::replace( ustring::iterator pbegin, ustring::iterator pend, const char* src, ustring::size_type n) { string_.replace(pbegin.base(), pend.base(), src, utf8_byte_offset(src, n)); return *this; } ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, const char* src) { string_.replace(pbegin.base(), pend.base(), src); return *this; } ustring& ustring::replace( ustring::iterator pbegin, ustring::iterator pend, ustring::size_type n, gunichar uc) { string_.replace(pbegin.base(), pend.base(), ustring(n, uc).string_); return *this; } ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, ustring::size_type n, char c) { string_.replace(pbegin.base(), pend.base(), n, c); return *this; } /**** Glib::ustring::erase() ***********************************************/ void ustring::clear() { string_.erase(); } ustring& ustring::erase(ustring::size_type i, ustring::size_type n) { const Utf8SubstrBounds bounds(string_, i, n); string_.erase(bounds.i, bounds.n); return *this; } 
ustring& ustring::erase() { string_.erase(); return *this; } ustring::iterator ustring::erase(ustring::iterator p) { ustring::iterator iter_end = p; ++iter_end; return iterator(string_.erase(p.base(), iter_end.base())); } ustring::iterator ustring::erase(ustring::iterator pbegin, ustring::iterator pend) { return iterator(string_.erase(pbegin.base(), pend.base())); } /**** Glib::ustring::compare() *********************************************/ int ustring::compare(UStringView rhs) const { return g_utf8_collate(string_.c_str(), rhs.c_str()); } int ustring::compare(ustring::size_type i, ustring::size_type n, UStringView rhs) const { return ustring(*this, i, n).compare(rhs.c_str()); } int ustring::compare(ustring::size_type i, ustring::size_type n, const ustring& rhs, ustring::size_type i2, ustring::size_type n2) const { return ustring(*this, i, n).compare(ustring(rhs, i2, n2)); } int ustring::compare( ustring::size_type i, ustring::size_type n, const char* rhs, ustring::size_type n2) const { return ustring(*this, i, n).compare(ustring(rhs, n2)); } /**** Glib::ustring -- index access ****************************************/ ustring::value_type ustring::operator[](ustring::size_type i) const { return g_utf8_get_char(g_utf8_offset_to_pointer(string_.data(), i)); } ustring::value_type ustring::at(ustring::size_type i) const { const size_type byte_offset = utf8_byte_offset(string_, i); // Throws std::out_of_range if the index is invalid. 
return g_utf8_get_char(&string_.at(byte_offset)); } /**** Glib::ustring -- iterator access *************************************/ ustring::iterator ustring::begin() { return iterator(string_.begin()); } ustring::iterator ustring::end() { return iterator(string_.end()); } ustring::const_iterator ustring::begin() const { return const_iterator(string_.begin()); } ustring::const_iterator ustring::end() const { return const_iterator(string_.end()); } ustring::reverse_iterator ustring::rbegin() { return reverse_iterator(iterator(string_.end())); } ustring::reverse_iterator ustring::rend() { return reverse_iterator(iterator(string_.begin())); } ustring::const_reverse_iterator ustring::rbegin() const { return const_reverse_iterator(const_iterator(string_.end())); } ustring::const_reverse_iterator ustring::rend() const { return const_reverse_iterator(const_iterator(string_.begin())); } ustring::const_iterator ustring::cbegin() const { return const_iterator(string_.begin()); } ustring::const_iterator ustring::cend() const { return const_iterator(string_.end()); } /**** Glib::ustring::find() ************************************************/ ustring::size_type ustring::find(const ustring& str, ustring::size_type i) const { return utf8_char_offset(string_, string_.find(str.string_, utf8_byte_offset(string_, i))); } ustring::size_type ustring::find(const char* str, ustring::size_type i, ustring::size_type n) const { return utf8_char_offset( string_, string_.find(str, utf8_byte_offset(string_, i), utf8_byte_offset(str, n))); } ustring::size_type ustring::find(const char* str, ustring::size_type i) const { return utf8_char_offset(string_, string_.find(str, utf8_byte_offset(string_, i))); } ustring::size_type ustring::find(gunichar uc, ustring::size_type i) const { const UnicharToUtf8 conv(uc); return utf8_char_offset(string_, string_.find(conv.buf, utf8_byte_offset(string_, i), conv.len)); } ustring::size_type ustring::find(char c, ustring::size_type i) const { return 
utf8_char_offset(string_, string_.find(c, utf8_byte_offset(string_, i))); } /**** Glib::ustring::rfind() ***********************************************/ ustring::size_type ustring::rfind(const ustring& str, ustring::size_type i) const { return utf8_char_offset(string_, string_.rfind(str.string_, utf8_byte_offset(string_, i))); } ustring::size_type ustring::rfind(const char* str, ustring::size_type i, ustring::size_type n) const { return utf8_char_offset( string_, string_.rfind(str, utf8_byte_offset(string_, i), utf8_byte_offset(str, n))); } ustring::size_type ustring::rfind(const char* str, ustring::size_type i) const { return utf8_char_offset(string_, string_.rfind(str, utf8_byte_offset(string_, i))); } ustring::size_type ustring::rfind(gunichar uc, ustring::size_type i) const { const UnicharToUtf8 conv(uc); return utf8_char_offset(string_, string_.rfind(conv.buf, utf8_byte_offset(string_, i), conv.len)); } ustring::size_type ustring::rfind(char c, ustring::size_type i) const { return utf8_char_offset(string_, string_.rfind(c, utf8_byte_offset(string_, i))); } /**** Glib::ustring::find_first_of() ***************************************/ ustring::size_type ustring::find_first_of(const ustring& match, ustring::size_type i) const { return utf8_find_first_of(string_, i, match.string_.data(), match.string_.size(), false); } ustring::size_type ustring::find_first_of(const char* match, ustring::size_type i, ustring::size_type n) const { return utf8_find_first_of(string_, i, match, n, false); } ustring::size_type ustring::find_first_of(const char* match, ustring::size_type i) const { return utf8_find_first_of(string_, i, match, -1, false); } ustring::size_type ustring::find_first_of(gunichar uc, ustring::size_type i) const { return find(uc, i); } ustring::size_type ustring::find_first_of(char c, ustring::size_type i) const { return find(c, i); } /**** Glib::ustring::find_last_of() ****************************************/ ustring::size_type ustring::find_last_of(const 
ustring& match, ustring::size_type i) const { return utf8_find_last_of(string_, i, match.string_.data(), match.string_.size(), false); } ustring::size_type ustring::find_last_of(const char* match, ustring::size_type i, ustring::size_type n) const { return utf8_find_last_of(string_, i, match, n, false); } ustring::size_type ustring::find_last_of(const char* match, ustring::size_type i) const { return utf8_find_last_of(string_, i, match, -1, false); } ustring::size_type ustring::find_last_of(gunichar uc, ustring::size_type i) const { return rfind(uc, i); } ustring::size_type ustring::find_last_of(char c, ustring::size_type i) const { return rfind(c, i); } /**** Glib::ustring::find_first_not_of() ***********************************/ ustring::size_type ustring::find_first_not_of(const ustring& match, ustring::size_type i) const { return utf8_find_first_of(string_, i, match.string_.data(), match.string_.size(), true); } ustring::size_type ustring::find_first_not_of(const char* match, ustring::size_type i, ustring::size_type n) const { return utf8_find_first_of(string_, i, match, n, true); } ustring::size_type ustring::find_first_not_of(const char* match, ustring::size_type i) const { return utf8_find_first_of(string_, i, match, -1, true); } // Unfortunately, all of the find_*_not_of() methods for single // characters need their own special implementation. 
// ustring::size_type ustring::find_first_not_of(gunichar uc, ustring::size_type i) const { const size_type bi = utf8_byte_offset(string_, i); if (bi != npos) { const char* const pbegin = string_.data(); const char* const pend = pbegin + string_.size(); for (const char *p = pbegin + bi; p < pend; p = g_utf8_next_char(p), ++i) { if (g_utf8_get_char(p) != uc) return i; } } return npos; } ustring::size_type ustring::find_first_not_of(char c, ustring::size_type i) const { const size_type bi = utf8_byte_offset(string_, i); if (bi != npos) { const char* const pbegin = string_.data(); const char* const pend = pbegin + string_.size(); for (const char *p = pbegin + bi; p < pend; p = g_utf8_next_char(p), ++i) { if (*p != c) return i; } } return npos; } /**** Glib::ustring::find_last_not_of() ************************************/ ustring::size_type ustring::find_last_not_of(const ustring& match, ustring::size_type i) const { return utf8_find_last_of(string_, i, match.string_.data(), match.string_.size(), true); } ustring::size_type ustring::find_last_not_of(const char* match, ustring::size_type i, ustring::size_type n) const { return utf8_find_last_of(string_, i, match, n, true); } ustring::size_type ustring::find_last_not_of(const char* match, ustring::size_type i) const { return utf8_find_last_of(string_, i, match, -1, true); } // Unfortunately, all of the find_*_not_of() methods for single // characters need their own special implementation. 
// ustring::size_type ustring::find_last_not_of(gunichar uc, ustring::size_type i) const { const char* const pbegin = string_.data(); const char* const pend = pbegin + string_.size(); size_type i_cur = 0; size_type i_found = npos; for (const char *p = pbegin; p < pend && i_cur <= i; p = g_utf8_next_char(p), ++i_cur) { if (g_utf8_get_char(p) != uc) i_found = i_cur; } return i_found; } ustring::size_type ustring::find_last_not_of(char c, ustring::size_type i) const { const char* const pbegin = string_.data(); const char* const pend = pbegin + string_.size(); size_type i_cur = 0; size_type i_found = npos; for (const char *p = pbegin; p < pend && i_cur <= i; p = g_utf8_next_char(p), ++i_cur) { if (*p != c) i_found = i_cur; } return i_found; } /**** Glib::ustring -- get size and resize *********************************/ bool ustring::empty() const { return string_.empty(); } ustring::size_type ustring::size() const { const char* const pdata = string_.data(); return g_utf8_pointer_to_offset(pdata, pdata + string_.size()); } ustring::size_type ustring::length() const { const char* const pdata = string_.data(); return g_utf8_pointer_to_offset(pdata, pdata + string_.size()); } ustring::size_type ustring::bytes() const { return string_.size(); } ustring::size_type ustring::capacity() const { return string_.capacity(); } ustring::size_type ustring::max_size() const { return string_.max_size(); } void ustring::resize(ustring::size_type n, gunichar uc) { const size_type size_now = size(); if (n < size_now) erase(n, npos); else if (n > size_now) append(n - size_now, uc); } void ustring::resize(ustring::size_type n, char c) { const size_type size_now = size(); if (n < size_now) erase(n, npos); else if (n > size_now) string_.append(n - size_now, c); } void ustring::reserve(ustring::size_type n) { string_.reserve(n); } /**** Glib::ustring -- C string access *************************************/ const char* ustring::data() const { return string_.data(); } const char* 
ustring::c_str() const { return string_.c_str(); } // Note that copy() requests UTF-8 character offsets as // parameters, but returns the number of copied bytes. // ustring::size_type ustring::copy(char* dest, ustring::size_type n, ustring::size_type i) const { const Utf8SubstrBounds bounds(string_, i, n); return string_.copy(dest, bounds.n, bounds.i); } /**** Glib::ustring -- UTF-8 utilities *************************************/ bool ustring::validate() const { return (g_utf8_validate(string_.data(), string_.size(), nullptr) != 0); } bool ustring::validate(ustring::iterator& first_invalid) { const char* const pdata = string_.data(); const char* valid_end = pdata; const int is_valid = g_utf8_validate(pdata, string_.size(), &valid_end); first_invalid = iterator(string_.begin() + (valid_end - pdata)); return (is_valid != 0); } bool ustring::validate(ustring::const_iterator& first_invalid) const { const char* const pdata = string_.data(); const char* valid_end = pdata; const int is_valid = g_utf8_validate(pdata, string_.size(), &valid_end); first_invalid = const_iterator(string_.begin() + (valid_end - pdata)); return (is_valid != 0); } ustring ustring::make_valid() const { return convert_return_gchar_ptr_to_ustring(g_utf8_make_valid(string_.data(), string_.size())); } bool ustring::is_ascii() const { const char* p = string_.data(); const char* const pend = p + string_.size(); for (; p != pend; ++p) { if ((static_cast(*p) & 0x80u) != 0) return false; } return true; } ustring ustring::normalize(NormalizeMode mode) const { return convert_return_gchar_ptr_to_ustring( g_utf8_normalize(string_.data(), string_.size(), static_cast(int(mode)))); } ustring ustring::uppercase() const { return convert_return_gchar_ptr_to_ustring(g_utf8_strup(string_.data(), string_.size())); } ustring ustring::lowercase() const { return convert_return_gchar_ptr_to_ustring(g_utf8_strdown(string_.data(), string_.size())); } ustring ustring::casefold() const { return 
convert_return_gchar_ptr_to_ustring(g_utf8_casefold(string_.data(), string_.size())); } std::string ustring::collate_key() const { return convert_return_gchar_ptr_to_stdstring(g_utf8_collate_key(string_.data(), string_.size())); } std::string ustring::casefold_collate_key() const { char* const casefold_buf = g_utf8_casefold(string_.data(), string_.size()); char* const key_buf = g_utf8_collate_key(casefold_buf, -1); g_free(casefold_buf); return std::string(make_unique_ptr_gfree(key_buf).get()); } /**** Glib::ustring -- Message formatting **********************************/ // static ustring ustring::compose_private(const Glib::ustring& fmt, std::initializer_list const ilist) { std::string::size_type result_size = fmt.raw().size(); // Guesstimate the final string size. for (auto const it: ilist) result_size += it->raw().size(); std::string result; result.reserve(result_size); const char* const pfmt = fmt.raw().c_str(); const char* start = pfmt; while (const char* const stop = std::strchr(start, '%')) { if (stop[1] == '%') { result.append(start, stop - start + 1); start = stop + 2; } else { const int index = Ascii::digit_value(stop[1]) - 1; const int size = ilist.size(); if (index >= 0 && index < size) { result.append(start, stop - start); result += (*(ilist.begin() + index))->raw(); start = stop + 2; } else { const char* const next = (stop[1] != '\0') ? g_utf8_next_char(stop + 1) : (stop + 1); // Copy invalid substitutions literally to the output. 
result.append(start, next - start); g_warning("invalid substitution \"%s\" in fmt string \"%s\"", result.c_str() + result.size() - (next - stop), pfmt); start = next; } } } result.append(start, pfmt + fmt.raw().size() - start); return result; } /**** Glib::ustring::SequenceToString **************************************/ ustring::SequenceToString::SequenceToString( Glib::ustring::iterator pbegin, Glib::ustring::iterator pend) : std::string(pbegin.base(), pend.base()) { } ustring::SequenceToString::SequenceToString( Glib::ustring::const_iterator pbegin, Glib::ustring::const_iterator pend) : std::string(pbegin.base(), pend.base()) { } /**** Glib::ustring::FormatStream ******************************************/ ustring::FormatStream::FormatStream() : stream_() { } ustring::FormatStream::~FormatStream() noexcept { } ustring ustring::FormatStream::to_string() const { GError* error = nullptr; #ifdef GLIBMM_HAVE_WIDE_STREAM const std::wstring str = stream_.str(); #if (defined(__STDC_ISO_10646__) || defined(_LIBCPP_VERSION)) && SIZEOF_WCHAR_T == 4 // Avoid going through iconv if wchar_t always contains UCS-4. glong n_bytes = 0; const auto buf = make_unique_ptr_gfree(g_ucs4_to_utf8( reinterpret_cast(str.data()), str.size(), nullptr, &n_bytes, &error)); #elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2 // Avoid going through iconv if wchar_t always contains UTF-16. 
glong n_bytes = 0; const auto buf = make_unique_ptr_gfree(g_utf16_to_utf8( reinterpret_cast(str.data()), str.size(), nullptr, &n_bytes, &error)); #else gsize n_bytes = 0; const auto buf = make_unique_ptr_gfree(g_convert(reinterpret_cast(str.data()), str.size() * sizeof(std::wstring::value_type), "UTF-8", "WCHAR_T", nullptr, &n_bytes, &error)); #endif /* !(__STDC_ISO_10646__ || G_OS_WIN32) */ #else /* !GLIBMM_HAVE_WIDE_STREAM */ const std::string str = stream_.str(); gsize n_bytes = 0; const auto buf = make_unique_ptr_gfree(g_locale_to_utf8(str.data(), str.size(), 0, &n_bytes, &error)); #endif /* !GLIBMM_HAVE_WIDE_STREAM */ if (error) { Glib::Error::throw_exception(error); } return ustring(buf.get(), buf.get() + n_bytes); } /**** Glib::ustring -- stream I/O operators ********************************/ std::istream& operator>>(std::istream& is, Glib::ustring& utf8_string) { std::string str; is >> str; GError* error = nullptr; gsize n_bytes = 0; const auto buf = make_unique_ptr_gfree(g_locale_to_utf8(str.data(), str.size(), nullptr, &n_bytes, &error)); if (error) { Glib::Error::throw_exception(error); } utf8_string.assign(buf.get(), buf.get() + n_bytes); return is; } std::ostream& operator<<(std::ostream& os, const Glib::ustring& utf8_string) { GError* error = nullptr; const auto buf = make_unique_ptr_gfree(g_locale_from_utf8( utf8_string.raw().data(), utf8_string.raw().size(), nullptr, nullptr, &error)); if (error) { Glib::Error::throw_exception(error); } // This won't work if the string contains NUL characters. Unfortunately, // std::ostream::write() ignores format flags, so we cannot use that. // The only option would be to create a temporary std::string. However, // even then GCC's libstdc++-v3 prints only the characters up to the first // NUL. Given this, there doesn't seem much of a point in allowing NUL in // formatted output. The semantics would be unclear anyway: what's the // screen width of a NUL? 
os << buf.get(); return os; } #ifdef GLIBMM_HAVE_WIDE_STREAM std::wistream& operator>>(std::wistream& is, ustring& utf8_string) { GError* error = nullptr; std::wstring wstr; is >> wstr; #if (defined(__STDC_ISO_10646__) || defined(_LIBCPP_VERSION)) && SIZEOF_WCHAR_T == 4 // Avoid going through iconv if wchar_t always contains UCS-4. glong n_bytes = 0; const auto buf = make_unique_ptr_gfree(g_ucs4_to_utf8( reinterpret_cast(wstr.data()), wstr.size(), nullptr, &n_bytes, &error)); #elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2 // Avoid going through iconv if wchar_t always contains UTF-16. glong n_bytes = 0; const auto buf = make_unique_ptr_gfree(g_utf16_to_utf8( reinterpret_cast(wstr.data()), wstr.size(), nullptr, &n_bytes, &error)); #else gsize n_bytes = 0; const auto buf = make_unique_ptr_gfree(g_convert(reinterpret_cast(wstr.data()), wstr.size() * sizeof(std::wstring::value_type), "UTF-8", "WCHAR_T", nullptr, &n_bytes, &error)); #endif // !(__STDC_ISO_10646__ || G_OS_WIN32) if (error) { Glib::Error::throw_exception(error); } utf8_string.assign(buf.get(), buf.get() + n_bytes); return is; } std::wostream& operator<<(std::wostream& os, const ustring& utf8_string) { GError* error = nullptr; #if (defined(__STDC_ISO_10646__) || defined(_LIBCPP_VERSION)) && SIZEOF_WCHAR_T == 4 // Avoid going through iconv if wchar_t always contains UCS-4. const auto buf = make_unique_ptr_gfree( g_utf8_to_ucs4(utf8_string.raw().data(), utf8_string.raw().size(), nullptr, nullptr, &error)); #elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2 // Avoid going through iconv if wchar_t always contains UTF-16. 
const auto buf = make_unique_ptr_gfree( g_utf8_to_utf16(utf8_string.raw().data(), utf8_string.raw().size(), nullptr, nullptr, &error)); #else const auto buf = make_unique_ptr_gfree(g_convert(utf8_string.raw().data(), utf8_string.raw().size(), "WCHAR_T", "UTF-8", nullptr, nullptr, &error)); #endif // !(__STDC_ISO_10646__ || G_OS_WIN32) if (error) { Glib::Error::throw_exception(error); } // This won't work if the string contains NUL characters. Unfortunately, // std::wostream::write() ignores format flags, so we cannot use that. // The only option would be to create a temporary std::wstring. However, // even then GCC's libstdc++-v3 prints only the characters up to the first // NUL. Given this, there doesn't seem much of a point in allowing NUL in // formatted output. The semantics would be unclear anyway: what's the // screen width of a NUL? os << reinterpret_cast(buf.get()); return os; } #endif /* GLIBMM_HAVE_WIDE_STREAM */ } // namespace Glib