diff --git a/.gitmodules b/.gitmodules index 503e0586..11d5dc74 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ path = third-party/cxxtest url = https://github.com/CxxTest/cxxtest branch = master +[submodule "third-party/utfcpp"] + path = third-party/utfcpp + url = https://github.com/nemtrif/utfcpp diff --git a/cmake/xlnt.cmake b/cmake/xlnt.cmake index 9615f3a5..e5e5129a 100644 --- a/cmake/xlnt.cmake +++ b/cmake/xlnt.cmake @@ -29,7 +29,7 @@ include_directories(../include/xlnt) include_directories(../source) include_directories(../third-party/miniz) include_directories(../third-party/pugixml/src) -include_directories(../third-party/utfcpp) +include_directories(../third-party/utfcpp/source) FILE(GLOB ROOT_HEADERS ../include/xlnt/*.hpp) FILE(GLOB CELL_HEADERS ../include/xlnt/cell/*.hpp) diff --git a/include/xlnt/utils/utf8string.hpp b/include/xlnt/utils/utf8string.hpp new file mode 100644 index 00000000..87601005 --- /dev/null +++ b/include/xlnt/utils/utf8string.hpp @@ -0,0 +1,25 @@ +#include +#include +#include + +#include + +namespace xlnt { + +class utf8string +{ +public: + static utf8string from_utf8(const std::string &s); + static utf8string from_latin1(const std::string &s); + static utf8string from_utf16(const std::string &s); + static utf8string from_utf32(const std::string &s); + + static bool is_valid(const std::string &s); + + bool is_valid() const; + +private: + std::vector bytes_; +}; + +} // namespace xlnt diff --git a/source/cell/cell.cpp b/source/cell/cell.cpp index 28ed03c2..668becbc 100644 --- a/source/cell/cell.cpp +++ b/source/cell/cell.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -61,108 +62,26 @@ std::string cell::check_string(const std::string &to_check) break; case encoding::utf8: { - std::vector bytes; - - for (char c : s) + if(!utf8string::is_valid(s)) { - auto byte = static_cast(c); - - if (byte < 128) - { - if(!bytes.empty()) - { - throw xlnt::unicode_decode_error(c); - } - } - else - { - if(!bytes.empty()) - { - if(byte >> 6 != 2) - { - throw xlnt::unicode_decode_error(c); - } - } - } - - bytes.push_back(byte); - - auto first_byte = bytes[0]; - auto num_bytes = 0; - - if(first_byte < 128) - { - num_bytes = 1; - } - else if(first_byte >> 5 == 0b110) - { - num_bytes = 2; - } - else if(first_byte >> 4 == 0b1110) - { - num_bytes = 3; - } - else if(first_byte >> 3 == 0b11110) - { - num_bytes = 4; - } - else if(first_byte >> 2 == 0b111110) - { - num_bytes = 5; - } - else if(first_byte >> 1 == 0b1111110) - { - num_bytes = 6; - } - - if(num_bytes > bytes.size()) - { - throw xlnt::unicode_decode_error(c); - } - - if(num_bytes == bytes.size()) - { - bytes.clear(); - } - } - - // Check last code point - if(!bytes.empty()) + throw xlnt::unicode_decode_error('0'); + } + break; + } + case encoding::utf16: + { + if(!utf8string::from_utf16(s).is_valid()) { - auto first_byte = bytes[0]; - auto num_bytes = 0; - - if(first_byte < 128) - { - num_bytes = 1; - } - else if(first_byte >> 5 == 0b110) - { - num_bytes = 2; - } - else if(first_byte >> 4 == 0b1110) - { - num_bytes = 3; - } - else if(first_byte >> 3 == 0b11110) - { - num_bytes = 4; - } - else if(first_byte >> 2 == 0b111110) - { - num_bytes = 5; - } - else if(first_byte >> 1 == 0b1111110) - { - num_bytes = 6; - } - - if(num_bytes > bytes.size()) - { - throw xlnt::unicode_decode_error(); - } - } - + throw xlnt::unicode_decode_error('0'); + } + break; + } + case encoding::utf32: + { + if(!utf8string::from_utf32(s).is_valid()) + { + throw xlnt::unicode_decode_error('0'); + } break; } default: diff --git a/source/cell/tests/test_cell.hpp b/source/cell/tests/test_cell.hpp index a3f8977c..1dc2f47b 100644 --- a/source/cell/tests/test_cell.hpp +++ b/source/cell/tests/test_cell.hpp @@ -448,4 +448,43 @@ public: cell.set_quote_prefix(true); TS_ASSERT(cell.quote_prefix()); } + + void test_check_string() + { + xlnt::workbook utf8_wb(xlnt::encoding::utf8); + auto ws = utf8_wb.get_active_sheet(); + auto cell = ws[xlnt::cell_reference("A1")]; + + std::vector valid_utf8_strings = + { + "a", + "\xc3\xa0", + "\xc3\xb1", + "\xe2\x82\xa1", + "\xf0\x90\x8c\xbc", + }; + + for(auto valid : valid_utf8_strings) + { + TS_ASSERT_THROWS_NOTHING(cell.check_string(valid)); + TS_ASSERT_THROWS_NOTHING(cell.set_value(valid)); + } + + std::vector invalid_utf8_strings = + { + "\xc3\x28", + "\xa0\xa1", + "\xe2\x28\xa1", + "\xe2\x82\x28", + "\xf0\x28\x8c\xbc", + "\xf0\x90\x28\xbc", + "\xf0\x28\x8c\x28", + }; + + for(auto invalid : invalid_utf8_strings) + { + TS_ASSERT_THROWS(cell.check_string(invalid), xlnt::unicode_decode_error); + TS_ASSERT_THROWS(cell.set_value(invalid), xlnt::unicode_decode_error); + } + } }; diff --git a/source/serialization/shared_strings_serializer.cpp b/source/serialization/shared_strings_serializer.cpp index 8a3c10bd..76cd30da 100644 --- a/source/serialization/shared_strings_serializer.cpp +++ b/source/serialization/shared_strings_serializer.cpp @@ -27,8 +27,12 @@ bool shared_strings_serializer::read_shared_strings(const xml_document &xml, std strings.clear(); auto root_node = xml.get_child("sst"); + auto unique_count = 0; - auto unique_count = std::stoull(root_node.get_attribute("uniqueCount")); + if (root_node.has_attribute("uniqueCount")) + { + unique_count = std::stoull(root_node.get_attribute("uniqueCount")); + } for (const auto &si_node : root_node.get_children()) { diff --git a/source/utils/utf8string.cpp b/source/utils/utf8string.cpp new file mode 100644 index 00000000..ad012a15 --- /dev/null +++ b/source/utils/utf8string.cpp @@ -0,0 +1,47 @@ +#include + +namespace xlnt { + +utf8string utf8string::from_utf8(const std::string &s) +{ + utf8string result; + std::copy(s.begin(), s.end(), std::back_inserter(result.bytes_)); + + return result; +} + +utf8string utf8string::from_latin1(const std::string &s) +{ + utf8string result; + std::copy(s.begin(), s.end(), std::back_inserter(result.bytes_)); + + return result; +} + +utf8string utf8string::from_utf16(const std::string &s) +{ + utf8string result; + utf8::utf16to8(s.begin(), s.end(), std::back_inserter(result.bytes_)); + + return result; +} + +utf8string utf8string::from_utf32(const std::string &s) +{ + utf8string result; + utf8::utf32to8(s.begin(), s.end(), std::back_inserter(result.bytes_)); + + return result; +} + +bool utf8string::is_valid(const std::string &s) +{ + return utf8::is_valid(s.begin(), s.end()); +} + +bool utf8string::is_valid() const +{ + return utf8::is_valid(bytes_.begin(), bytes_.end()); +} + +} // namespace xlnt diff --git a/source/workbook/tests/test_workbook.hpp b/source/workbook/tests/test_workbook.hpp index e07b0901..6cb91c03 100644 --- a/source/workbook/tests/test_workbook.hpp +++ b/source/workbook/tests/test_workbook.hpp @@ -207,4 +207,10 @@ public: TS_ASSERT_EQUALS(test_sheet.get_cell("A1").get_value(), float_value); } + + void test_read_empty_shared_strings() + { + xlnt::workbook test_ss; + TS_ASSERT_THROWS_NOTHING(test_ss.load(PathHelper::GetDataDirectory("/genuine/number_empty_shared_strings.xlsx"))); + } }; diff --git a/tests/data/genuine/number_empty_shared_strings.xlsx b/tests/data/genuine/number_empty_shared_strings.xlsx new file mode 100644 index 00000000..4caadb75 Binary files /dev/null and b/tests/data/genuine/number_empty_shared_strings.xlsx differ diff --git a/third-party/utfcpp b/third-party/utfcpp new file mode 160000 index 00000000..f029fcc2 --- /dev/null +++ b/third-party/utfcpp @@ -0,0 +1 @@ +Subproject commit f029fcc2fbc7cd979925f198f7e6ca8170d45000