| 1 | /*============================================================================= |
| 2 | Copyright (c) 2001-2011 Joel de Guzman |
| 3 | Copyright (c) 2023 Nikita Kniazev |
| 4 | |
| 5 | Distributed under the Boost Software License, Version 1.0. (See accompanying |
| 6 | file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
| 7 | ==============================================================================*/ |
| 8 | #if !defined(BOOST_SPIRIT_UC_TYPES_NOVEMBER_23_2008_0840PM) |
| 9 | #define BOOST_SPIRIT_UC_TYPES_NOVEMBER_23_2008_0840PM |
| 10 | |
| 11 | #if defined(_MSC_VER) |
| 12 | #pragma once |
| 13 | #endif |
| 14 | |
| 15 | #include <boost/config.hpp> |
| 16 | #include <boost/cstdint.hpp> |
| 17 | #include <boost/type_traits/make_unsigned.hpp> |
| 18 | #include <string> |
| 19 | |
| 20 | namespace boost { namespace spirit |
| 21 | { |
| 22 | typedef ::boost::uint32_t ucs4_char; |
| 23 | typedef char utf8_char; |
| 24 | typedef std::basic_string<ucs4_char> ucs4_string; |
| 25 | typedef std::basic_string<utf8_char> utf8_string; |
| 26 | |
| 27 | namespace detail { |
| 28 | inline void utf8_put_encode(utf8_string& out, ucs4_char x) |
| 29 | { |
| 30 | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf D90 |
| 31 | if (BOOST_UNLIKELY(x > 0x10FFFFul || (0xD7FFul < x && x < 0xE000ul))) |
| 32 | x = 0xFFFDul; |
| 33 | |
| 34 | // Table 3-6. UTF-8 Bit Distribution |
| 35 | if (x < 0x80ul) { |
| 36 | out.push_back(c: static_cast<unsigned char>(x)); |
| 37 | } |
| 38 | else if (x < 0x800ul) { |
| 39 | out.push_back(c: static_cast<unsigned char>(0xC0ul + (x >> 6))); |
| 40 | out.push_back(c: static_cast<unsigned char>(0x80ul + (x & 0x3Ful))); |
| 41 | } |
| 42 | else if (x < 0x10000ul) { |
| 43 | out.push_back(c: static_cast<unsigned char>(0xE0ul + (x >> 12))); |
| 44 | out.push_back(c: static_cast<unsigned char>(0x80ul + ((x >> 6) & 0x3Ful))); |
| 45 | out.push_back(c: static_cast<unsigned char>(0x80ul + (x & 0x3Ful))); |
| 46 | } |
| 47 | else { |
| 48 | out.push_back(c: static_cast<unsigned char>(0xF0ul + (x >> 18))); |
| 49 | out.push_back(c: static_cast<unsigned char>(0x80ul + ((x >> 12) & 0x3Ful))); |
| 50 | out.push_back(c: static_cast<unsigned char>(0x80ul + ((x >> 6) & 0x3Ful))); |
| 51 | out.push_back(c: static_cast<unsigned char>(0x80ul + (x & 0x3Ful))); |
| 52 | } |
| 53 | } |
| 54 | } |
| 55 | |
| 56 | template <typename Char> |
| 57 | inline utf8_string to_utf8(Char value) |
| 58 | { |
| 59 | utf8_string result; |
| 60 | typedef typename make_unsigned<Char>::type UChar; |
| 61 | detail::utf8_put_encode(out&: result, x: static_cast<UChar>(value)); |
| 62 | return result; |
| 63 | } |
| 64 | |
| 65 | template <typename Char> |
| 66 | inline utf8_string to_utf8(Char const* str) |
| 67 | { |
| 68 | utf8_string result; |
| 69 | typedef typename make_unsigned<Char>::type UChar; |
| 70 | while (*str) |
| 71 | detail::utf8_put_encode(out&: result, x: static_cast<UChar>(*str++)); |
| 72 | return result; |
| 73 | } |
| 74 | |
| 75 | template <typename Char, typename Traits, typename Allocator> |
| 76 | inline utf8_string |
| 77 | to_utf8(std::basic_string<Char, Traits, Allocator> const& str) |
| 78 | { |
| 79 | utf8_string result; |
| 80 | typedef typename make_unsigned<Char>::type UChar; |
| 81 | for (Char const* ptr = str.data(), |
| 82 | * end = ptr + str.size(); ptr < end; ++ptr) |
| 83 | detail::utf8_put_encode(out&: result, x: static_cast<UChar>(*ptr)); |
| 84 | return result; |
| 85 | } |
| 86 | |
| 87 | // Assume wchar_t content is UTF-16 on MSVC, or mingw/wineg++ with -fshort-wchar |
| 88 | #if defined(_MSC_VER) || defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 2 |
| 89 | inline utf8_string to_utf8(wchar_t value) |
| 90 | { |
| 91 | utf8_string result; |
| 92 | detail::utf8_put_encode(result, static_cast<make_unsigned<wchar_t>::type>(value)); |
| 93 | return result; |
| 94 | } |
| 95 | |
| 96 | namespace detail { |
| 97 | inline ucs4_char decode_utf16(wchar_t const*& s) |
| 98 | { |
| 99 | typedef make_unsigned<wchar_t>::type uwchar_t; |
| 100 | |
| 101 | uwchar_t x(*s); |
| 102 | if (x < 0xD800ul || x > 0xDFFFul) |
| 103 | return x; |
| 104 | |
| 105 | // expected high-surrogate |
| 106 | if (BOOST_UNLIKELY((x >> 10) != 0x36ul)) |
| 107 | return 0xFFFDul; |
| 108 | |
| 109 | uwchar_t y(*++s); |
| 110 | // expected low-surrogate |
| 111 | if (BOOST_UNLIKELY((y >> 10) != 0x37ul)) |
| 112 | return 0xFFFDul; |
| 113 | |
| 114 | return ((x & 0x3FFul) << 10) + (y & 0x3FFul) + 0x10000ul; |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | inline utf8_string to_utf8(wchar_t const* str) |
| 119 | { |
| 120 | utf8_string result; |
| 121 | for (ucs4_char c; (c = detail::decode_utf16(str)) != ucs4_char(); ++str) |
| 122 | detail::utf8_put_encode(result, c); |
| 123 | return result; |
| 124 | } |
| 125 | |
| 126 | template <typename Traits, typename Allocator> |
| 127 | inline utf8_string |
| 128 | to_utf8(std::basic_string<wchar_t, Traits, Allocator> const& str) |
| 129 | { |
| 130 | return to_utf8(str.c_str()); |
| 131 | } |
| 132 | #endif |
| 133 | }} |
| 134 | |
| 135 | #endif |
| 136 | |