properly specify transcoding behavior

2024-03-22 13:10:44 +08:00 · 2018-02-24 18:50:23 -05:00 · 2018-02-24 18:50:23 -05:00 · 14378e7126
commit 14378e7126
parent 22c41d9482
7 changed files with 170 additions and 66 deletions
--- a/docs/source/codecvt.rst
+++ b/docs/source/codecvt.rst
@ -7,7 +7,7 @@ because this is surprisingly hard using standard C++

 	The ``<codecvt>`` header is no longer used and sol2 now converts utf8, utf16, and utf32 with internal routines. If you have a problem with the transcoding, please `file an issue report`_.

-``std::(w)string(u16/u32)`` are assumed to be in the platform's native wide (for ``wstring``) or unicode format. Lua canonically stores its string literals as utf8 and embraces utf8, albeit its storage is simply a sequence of bytes that are also null-terminated (it is also counted and the size is kept around, so embedded nulls can be used in the string). Therefore, if you need to interact with the unicode or wide alternatives of strings, runtime conversions are performed from the (assumed) utf8 string data into other forms. These conversions check for well-formed UTF, and will error if they are not when converting.
+``std::(w)string(u16/u32)`` are assumed to be in the platform's native wide (for ``wstring``) or unicode format. Lua canonically stores its string literals as utf8 and embraces utf8, albeit its storage is simply a sequence of bytes that are also null-terminated (it is also counted and the size is kept around, so embedded nulls can be used in the string). Therefore, if you need to interact with the unicode or wide alternatives of strings, runtime conversions are performed from the (assumed) utf8 string data into other forms. These conversions check for well-formed UTF, and will replace ill-formed characters with the unicode replacement codepoint, 0xFFFD.

 Note that we cannot give you ``string_view``s to utf16 or utf32 strings: Lua does not hold them in memory this way. You can perhaps do your own customization to provide for this if need be. Remember that Lua stores a counted sequence of bytes: serializing your string as bytes and pushing a string type into Lua's stack will work, though do not expect any complex string routines or printing to behave nicely with your code.

--- a/docs/source/functions.rst
+++ b/docs/source/functions.rst
@ -29,17 +29,7 @@ There are a number of examples dealing with functions and how they can be bound
 working with callables/lambdas
 ------------------------------

-To be explicit about wanting a struct to be interpreted as a function, use ``mytable.set_function( key, func_value );``. You can be explicit about wanting a function as well by using the :doc:`sol::as_function<../api/as_function>` call, which will wrap and identify your type as a function.
-
-.. note::
-
-	As of sol 2.18.1, the below 
-
-.. note::
-
-	Function objects ``obj`` -- a struct with a ``return_type operator()( ... )`` member defined on them, like all C++ lambdas -- are not interpreted as functions when you use ``set`` for ``mytable.set( key, value )`` and ``state.create_table(_with)( ... )``. This only happens automagically with ``mytable[key] = obj``.
-
-	Note that this also applies to calling functions, for example: ``my_state["table"]["sort"]( some_table, sorting_object );``.
+To be explicit about wanting a struct to be interpreted as a function, use ``mytable.set_function( key, func_value );``. You can also use the :doc:`sol::as_function<../api/as_function>` call, which will wrap and identify your type as a function.

 Furthermore, it is important to know that lambdas without a specified return type (and a non-const, non-reference-qualified ``auto``) will decay return values. To capture or return references explicitly, use ``decltype(auto)`` or specify the return type **exactly** as desired:

--- a/single/sol/sol.hpp
+++ b/single/sol/sol.hpp
@ -20,8 +20,8 @@
 // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 // This file was generated with a script.
-// Generated 2018-02-23 21:59:31.750406 UTC
-// This header was generated with sol v2.19.4 (revision b60132e)
+// Generated 2018-02-24 23:50:10.791344 UTC
+// This header was generated with sol v2.19.4 (revision 22c41d9)
 // https://github.com/ThePhD/sol2

 #ifndef SOL_SINGLE_INCLUDE_HPP
@ -8296,7 +8296,7 @@ namespace sol {
 	// Everything here was lifted pretty much straight out of
 	// ogonek, because fuck figuring it out=
 	namespace unicode {
-		enum error_code {
+		enum class error_code {
 			ok = 0,
 			invalid_code_point,
 			invalid_code_unit,
@ -8388,6 +8388,7 @@ namespace sol {
 			static constexpr int lead_surrogate_bitmask = 0xFFC00;
 			static constexpr int trail_surrogate_bitmask = 0x3FF;
 			static constexpr int lead_shifted_bits = 10;
+			static constexpr char32_t replacement = 0xFFFD;

 			static char32_t combine_surrogates(char16_t lead, char16_t trail) {
 				auto hi = lead - first_lead_surrogate;
@ -9022,6 +9023,24 @@ namespace stack {

 	template <typename Traits, typename Al>
 	struct getter<std::basic_string<char16_t, Traits, Al>> {
+		template <typename F>
+		static void convert(const char* strb, const char* stre, F&& f) {
+			char32_t cp = 0;
+			for (const char* strtarget = strb; strtarget < stre;) {
+				auto dr = unicode::utf8_to_code_point(strtarget, stre);
+				if (dr.error != unicode::error_code::ok) {
+					cp = unicode::unicode_detail::replacement;
+					++strtarget;
+				}
+				else {
+					cp = dr.codepoint;
+					strtarget = dr.next;
+				}
+				auto er = unicode::code_point_to_utf16(cp);
+				f(er);
+			}
+		}
+
 		template <typename S>
 		static S get_into(lua_State* L, int index, record& tracking) {
 			typedef typename S::value_type Ch;
@ -9033,22 +9052,18 @@ namespace stack {
 			std::size_t needed_size = 0;
 			const char* strb = utf8p;
 			const char* stre = utf8p + len;
-			for (const char* strtarget = strb; strtarget < stre;) {
-				auto dr = unicode::utf8_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf16(dr.codepoint);
+			auto count_units = [&needed_size](const unicode::encoded_result<char16_t> er) {
 				needed_size += er.code_units_size;
-				strtarget = dr.next;
-			}
+			};
+			convert(strb, stre, count_units);
 			S r(needed_size, static_cast<Ch>(0));
 			r.resize(needed_size);
 			Ch* target = &r[0];
-			for (const char* strtarget = strb; strtarget < stre;) {
-				auto dr = unicode::utf8_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf16(dr.codepoint);
+			auto copy_units = [&target](const unicode::encoded_result<char16_t> er) {
 				std::memcpy(target, er.code_units.data(), er.code_units_size * sizeof(Ch));
-				strtarget = dr.next;
 				target += er.code_units_size;
-			}
+			};
+			convert(strb, stre, copy_units);
 			return r;
 		}

@ -9059,6 +9074,24 @@ namespace stack {

 	template <typename Traits, typename Al>
 	struct getter<std::basic_string<char32_t, Traits, Al>> {
+		template <typename F>
+		static void convert(const char* strb, const char* stre, F&& f) {
+			char32_t cp = 0;
+			for (const char* strtarget = strb; strtarget < stre;) {
+				auto dr = unicode::utf8_to_code_point(strtarget, stre);
+				if (dr.error != unicode::error_code::ok) {
+					cp = unicode::unicode_detail::replacement;
+					++strtarget;
+				}
+				else {
+					cp = dr.codepoint;
+					strtarget = dr.next;
+				}
+				auto er = unicode::code_point_to_utf32(cp);
+				f(er);
+			}
+		}
+
 		template <typename S>
 		static S get_into(lua_State* L, int index, record& tracking) {
 			typedef typename S::value_type Ch;
@ -9070,22 +9103,18 @@ namespace stack {
 			std::size_t needed_size = 0;
 			const char* strb = utf8p;
 			const char* stre = utf8p + len;
-			for (const char* strtarget = strb; strtarget < stre;) {
-				auto dr = unicode::utf8_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf32(dr.codepoint);
+			auto count_units = [&needed_size](const unicode::encoded_result<char32_t> er) {
 				needed_size += er.code_units_size;
-				strtarget = dr.next;
-			}
+			};
+			convert(strb, stre, count_units);
 			S r(needed_size, static_cast<Ch>(0));
 			r.resize(needed_size);
 			Ch* target = &r[0];
-			for (const char* strtarget = strb; strtarget < stre;) {
-				auto dr = unicode::utf8_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf32(dr.codepoint);
+			auto copy_units = [&target](const unicode::encoded_result<char32_t> er) {
 				std::memcpy(target, er.code_units.data(), er.code_units_size * sizeof(Ch));
-				strtarget = dr.next;
 				target += er.code_units_size;
-			}
+			};
+			convert(strb, stre, copy_units);
 			return r;
 		}

@ -9100,8 +9129,15 @@ namespace stack {
 			string_view utf8 = stack::get<string_view>(L, index, tracking);
 			const char* strb = utf8.data();
 			const char* stre = utf8.data() + utf8.size();
+			char32_t cp = 0;
 			auto dr = unicode::utf8_to_code_point(strb, stre);
-			auto er = unicode::code_point_to_utf16(dr.codepoint);
+			if (dr.error != unicode::error_code::ok) {
+				cp = unicode::unicode_detail::replacement;
+			}
+			else {
+				cp = dr.codepoint;
+			}
+			auto er = unicode::code_point_to_utf16(cp);
 			return er.code_units[0];
 		}
 	};
@ -9112,8 +9148,15 @@ namespace stack {
 			string_view utf8 = stack::get<string_view>(L, index, tracking);
 			const char* strb = utf8.data();
 			const char* stre = utf8.data() + utf8.size();
+			char32_t cp = 0;
 			auto dr = unicode::utf8_to_code_point(strb, stre);
-			auto er = unicode::code_point_to_utf32(dr.codepoint);
+			if (dr.error != unicode::error_code::ok) {
+				cp = unicode::unicode_detail::replacement;
+			}
+			else {
+				cp = dr.codepoint;
+			}
+			auto er = unicode::code_point_to_utf32(cp);
 			return er.code_units[0];
 		}
 	};
@ -10197,9 +10240,16 @@ namespace stack {
 	struct pusher<const char16_t*> {
 		static int convert_into(lua_State* L, char* start, std::size_t, const char16_t* strb, const char16_t* stre) {
 			char* target = start;
+			char32_t cp = 0;
 			for (const char16_t* strtarget = strb; strtarget < stre;) {
 				auto dr = unicode::utf16_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf8(dr.codepoint);
+				if (dr.error != unicode::error_code::ok) {
+					cp = unicode::unicode_detail::replacement;
+				}
+				else {
+					cp = dr.codepoint;
+				}
+				auto er = unicode::code_point_to_utf8(cp);
 				const char* utf8data = er.code_units.data();
 				std::memcpy(target, utf8data, er.code_units_size);
 				target += er.code_units_size;
@ -10270,9 +10320,16 @@ namespace stack {
 	struct pusher<const char32_t*> {
 		static int convert_into(lua_State* L, char* start, std::size_t, const char32_t* strb, const char32_t* stre) {
 			char* target = start;
+			char32_t cp = 0;
 			for (const char32_t* strtarget = strb; strtarget < stre;) {
 				auto dr = unicode::utf32_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf8(dr.codepoint);
+				if (dr.error != unicode::error_code::ok) {
+					cp = unicode::unicode_detail::replacement;
+				}
+				else {
+					cp = dr.codepoint;
+				}
+				auto er = unicode::code_point_to_utf8(cp);
 				const char* data = er.code_units.data();
 				std::memcpy(target, data, er.code_units_size);
 				target += er.code_units_size;
--- a/single/sol/sol_forward.hpp
+++ b/single/sol/sol_forward.hpp
@ -20,8 +20,8 @@
 // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 // This file was generated with a script.
-// Generated 2018-02-23 21:59:32.038059 UTC
-// This header was generated with sol v2.19.4 (revision b60132e)
+// Generated 2018-02-24 23:50:11.000275 UTC
+// This header was generated with sol v2.19.4 (revision 22c41d9)
 // https://github.com/ThePhD/sol2

 #ifndef SOL_SINGLE_INCLUDE_FORWARD_HPP
--- a/sol/stack_get.hpp
+++ b/sol/stack_get.hpp
@ -467,6 +467,24 @@ namespace stack {

 	template <typename Traits, typename Al>
 	struct getter<std::basic_string<char16_t, Traits, Al>> {
+		template <typename F>
+		static void convert(const char* strb, const char* stre, F&& f) {
+			char32_t cp = 0;
+			for (const char* strtarget = strb; strtarget < stre;) {
+				auto dr = unicode::utf8_to_code_point(strtarget, stre);
+				if (dr.error != unicode::error_code::ok) {
+					cp = unicode::unicode_detail::replacement;
+					++strtarget;
+				}
+				else {
+					cp = dr.codepoint;
+					strtarget = dr.next;
+				}
+				auto er = unicode::code_point_to_utf16(cp);
+				f(er);
+			}
+		}
+
 		template <typename S>
 		static S get_into(lua_State* L, int index, record& tracking) {
 			typedef typename S::value_type Ch;
@ -478,22 +496,18 @@ namespace stack {
 			std::size_t needed_size = 0;
 			const char* strb = utf8p;
 			const char* stre = utf8p + len;
-			for (const char* strtarget = strb; strtarget < stre;) {
-				auto dr = unicode::utf8_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf16(dr.codepoint);
+			auto count_units = [&needed_size](const unicode::encoded_result<char16_t> er) {
 				needed_size += er.code_units_size;
-				strtarget = dr.next;
-			}
+			};
+			convert(strb, stre, count_units);
 			S r(needed_size, static_cast<Ch>(0));
 			r.resize(needed_size);
 			Ch* target = &r[0];
-			for (const char* strtarget = strb; strtarget < stre;) {
-				auto dr = unicode::utf8_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf16(dr.codepoint);
+			auto copy_units = [&target](const unicode::encoded_result<char16_t> er) {
 				std::memcpy(target, er.code_units.data(), er.code_units_size * sizeof(Ch));
-				strtarget = dr.next;
 				target += er.code_units_size;
-			}
+			};
+			convert(strb, stre, copy_units);
 			return r;
 		}

@ -504,6 +518,24 @@ namespace stack {

 	template <typename Traits, typename Al>
 	struct getter<std::basic_string<char32_t, Traits, Al>> {
+		template <typename F>
+		static void convert(const char* strb, const char* stre, F&& f) {
+			char32_t cp = 0;
+			for (const char* strtarget = strb; strtarget < stre;) {
+				auto dr = unicode::utf8_to_code_point(strtarget, stre);
+				if (dr.error != unicode::error_code::ok) {
+					cp = unicode::unicode_detail::replacement;
+					++strtarget;
+				}
+				else {
+					cp = dr.codepoint;
+					strtarget = dr.next;
+				}
+				auto er = unicode::code_point_to_utf32(cp);
+				f(er);
+			}
+		}
+
 		template <typename S>
 		static S get_into(lua_State* L, int index, record& tracking) {
 			typedef typename S::value_type Ch;
@ -515,22 +547,18 @@ namespace stack {
 			std::size_t needed_size = 0;
 			const char* strb = utf8p;
 			const char* stre = utf8p + len;
-			for (const char* strtarget = strb; strtarget < stre;) {
-				auto dr = unicode::utf8_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf32(dr.codepoint);
+			auto count_units = [&needed_size](const unicode::encoded_result<char32_t> er) {
 				needed_size += er.code_units_size;
-				strtarget = dr.next;
-			}
+			};
+			convert(strb, stre, count_units);
 			S r(needed_size, static_cast<Ch>(0));
 			r.resize(needed_size);
 			Ch* target = &r[0];
-			for (const char* strtarget = strb; strtarget < stre;) {
-				auto dr = unicode::utf8_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf32(dr.codepoint);
+			auto copy_units = [&target](const unicode::encoded_result<char32_t> er) {
 				std::memcpy(target, er.code_units.data(), er.code_units_size * sizeof(Ch));
-				strtarget = dr.next;
 				target += er.code_units_size;
-			}
+			};
+			convert(strb, stre, copy_units);
 			return r;
 		}

@ -545,8 +573,15 @@ namespace stack {
 			string_view utf8 = stack::get<string_view>(L, index, tracking);
 			const char* strb = utf8.data();
 			const char* stre = utf8.data() + utf8.size();
+			char32_t cp = 0;
 			auto dr = unicode::utf8_to_code_point(strb, stre);
-			auto er = unicode::code_point_to_utf16(dr.codepoint);
+			if (dr.error != unicode::error_code::ok) {
+				cp = unicode::unicode_detail::replacement;
+			}
+			else {
+				cp = dr.codepoint;
+			}
+			auto er = unicode::code_point_to_utf16(cp);
 			return er.code_units[0];
 		}
 	};
@ -557,8 +592,15 @@ namespace stack {
 			string_view utf8 = stack::get<string_view>(L, index, tracking);
 			const char* strb = utf8.data();
 			const char* stre = utf8.data() + utf8.size();
+			char32_t cp = 0;
 			auto dr = unicode::utf8_to_code_point(strb, stre);
-			auto er = unicode::code_point_to_utf32(dr.codepoint);
+			if (dr.error != unicode::error_code::ok) {
+				cp = unicode::unicode_detail::replacement;
+			}
+			else {
+				cp = dr.codepoint;
+			}
+			auto er = unicode::code_point_to_utf32(cp);
 			return er.code_units[0];
 		}
 	};
--- a/sol/stack_push.hpp
+++ b/sol/stack_push.hpp
@ -692,9 +692,16 @@ namespace stack {
 	struct pusher<const char16_t*> {
 		static int convert_into(lua_State* L, char* start, std::size_t, const char16_t* strb, const char16_t* stre) {
 			char* target = start;
+			char32_t cp = 0;
 			for (const char16_t* strtarget = strb; strtarget < stre;) {
 				auto dr = unicode::utf16_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf8(dr.codepoint);
+				if (dr.error != unicode::error_code::ok) {
+					cp = unicode::unicode_detail::replacement;
+				}
+				else {
+					cp = dr.codepoint;
+				}
+				auto er = unicode::code_point_to_utf8(cp);
 				const char* utf8data = er.code_units.data();
 				std::memcpy(target, utf8data, er.code_units_size);
 				target += er.code_units_size;
@ -765,9 +772,16 @@ namespace stack {
 	struct pusher<const char32_t*> {
 		static int convert_into(lua_State* L, char* start, std::size_t, const char32_t* strb, const char32_t* stre) {
 			char* target = start;
+			char32_t cp = 0;
 			for (const char32_t* strtarget = strb; strtarget < stre;) {
 				auto dr = unicode::utf32_to_code_point(strtarget, stre);
-				auto er = unicode::code_point_to_utf8(dr.codepoint);
+				if (dr.error != unicode::error_code::ok) {
+					cp = unicode::unicode_detail::replacement;
+				}
+				else {
+					cp = dr.codepoint;
+				}
+				auto er = unicode::code_point_to_utf8(cp);
 				const char* data = er.code_units.data();
 				std::memcpy(target, data, er.code_units_size);
 				target += er.code_units_size;
--- a/sol/unicode.hpp
+++ b/sol/unicode.hpp
@ -8,7 +8,7 @@ namespace sol {
 	// Everything here was lifted pretty much straight out of
 	// ogonek, because fuck figuring it out=
 	namespace unicode {
-		enum error_code {
+		enum class error_code {
 			ok = 0,
 			invalid_code_point,
 			invalid_code_unit,
@ -100,6 +100,7 @@ namespace sol {
 			static constexpr int lead_surrogate_bitmask = 0xFFC00;
 			static constexpr int trail_surrogate_bitmask = 0x3FF;
 			static constexpr int lead_shifted_bits = 10;
+			static constexpr char32_t replacement = 0xFFFD;

 			static char32_t combine_surrogates(char16_t lead, char16_t trail) {
 				auto hi = lead - first_lead_surrogate;