Merge remote-tracking branch 'tfussell/master'

This commit is contained in:
degaart 2015-12-23 08:53:44 +03:00
commit e5ce1fd2e6
10 changed files with 146 additions and 102 deletions

3
.gitmodules vendored
View File

@ -7,3 +7,6 @@
path = third-party/cxxtest
url = https://github.com/CxxTest/cxxtest
branch = master
[submodule "third-party/utfcpp"]
path = third-party/utfcpp
url = https://github.com/nemtrif/utfcpp

View File

@ -29,7 +29,7 @@ include_directories(../include/xlnt)
include_directories(../source)
include_directories(../third-party/miniz)
include_directories(../third-party/pugixml/src)
include_directories(../third-party/utfcpp)
include_directories(../third-party/utfcpp/source)
FILE(GLOB ROOT_HEADERS ../include/xlnt/*.hpp)
FILE(GLOB CELL_HEADERS ../include/xlnt/cell/*.hpp)

View File

@ -0,0 +1,25 @@
#pragma once

#include <cstdint>
#include <string>
#include <vector>

#include <utf8.h>

namespace xlnt {

/// Byte container for text that is meant to be UTF-8 encoded.
/// Instances are obtained through the static from_* factories; the stored
/// bytes can then be checked with is_valid().
class utf8string
{
public:
    /// Creates an instance from s, which is treated as UTF-8 input.
    static utf8string from_utf8(const std::string &s);

    /// Creates an instance from s, which is treated as Latin-1 input.
    static utf8string from_latin1(const std::string &s);

    /// Creates an instance from s, which is treated as UTF-16 input
    /// carried in a std::string.
    static utf8string from_utf16(const std::string &s);

    /// Creates an instance from s, which is treated as UTF-32 input
    /// carried in a std::string.
    static utf8string from_utf32(const std::string &s);

    /// Returns true when the characters of s form valid UTF-8
    /// according to utf8::is_valid.
    static bool is_valid(const std::string &s);

    /// Returns true when the bytes held by this instance form valid UTF-8
    /// according to utf8::is_valid.
    bool is_valid() const;

private:
    /// Encoded bytes owned by this instance.
    std::vector<std::uint8_t> bytes_;
};

} // namespace xlnt

View File

@ -14,6 +14,7 @@
#include <xlnt/utils/time.hpp>
#include <xlnt/utils/timedelta.hpp>
#include <xlnt/utils/exceptions.hpp>
#include <xlnt/utils/utf8string.hpp>
#include <xlnt/workbook/workbook.hpp>
#include <xlnt/worksheet/column_properties.hpp>
#include <xlnt/worksheet/row_properties.hpp>
@ -61,108 +62,26 @@ std::string cell::check_string(const std::string &to_check)
break;
case encoding::utf8:
{
std::vector<std::uint8_t> bytes;
for (char c : s)
if(!utf8string::is_valid(s))
{
auto byte = static_cast<std::uint8_t>(c);
if (byte < 128)
{
if(!bytes.empty())
{
throw xlnt::unicode_decode_error(c);
}
}
else
{
if(!bytes.empty())
{
if(byte >> 6 != 2)
{
throw xlnt::unicode_decode_error(c);
}
}
}
bytes.push_back(byte);
auto first_byte = bytes[0];
auto num_bytes = 0;
if(first_byte < 128)
{
num_bytes = 1;
}
else if(first_byte >> 5 == 0b110)
{
num_bytes = 2;
}
else if(first_byte >> 4 == 0b1110)
{
num_bytes = 3;
}
else if(first_byte >> 3 == 0b11110)
{
num_bytes = 4;
}
else if(first_byte >> 2 == 0b111110)
{
num_bytes = 5;
}
else if(first_byte >> 1 == 0b1111110)
{
num_bytes = 6;
}
if(num_bytes > bytes.size())
{
throw xlnt::unicode_decode_error(c);
}
if(num_bytes == bytes.size())
{
bytes.clear();
}
throw xlnt::unicode_decode_error('0');
}
// Check last code point
if(!bytes.empty())
break;
}
case encoding::utf16:
{
if(!utf8string::from_utf16(s).is_valid())
{
auto first_byte = bytes[0];
auto num_bytes = 0;
if(first_byte < 128)
{
num_bytes = 1;
}
else if(first_byte >> 5 == 0b110)
{
num_bytes = 2;
}
else if(first_byte >> 4 == 0b1110)
{
num_bytes = 3;
}
else if(first_byte >> 3 == 0b11110)
{
num_bytes = 4;
}
else if(first_byte >> 2 == 0b111110)
{
num_bytes = 5;
}
else if(first_byte >> 1 == 0b1111110)
{
num_bytes = 6;
}
if(num_bytes > bytes.size())
{
throw xlnt::unicode_decode_error();
}
throw xlnt::unicode_decode_error('0');
}
break;
}
case encoding::utf32:
{
if(!utf8string::from_utf32(s).is_valid())
{
throw xlnt::unicode_decode_error('0');
}
break;
}
default:

View File

@ -448,4 +448,43 @@ public:
cell.set_quote_prefix(true);
TS_ASSERT(cell.quote_prefix());
}
// Verifies that a cell in a UTF-8 workbook accepts well-formed UTF-8 and
// rejects malformed sequences with xlnt::unicode_decode_error, both through
// check_string() directly and through set_value().
void test_check_string()
{
    xlnt::workbook utf8_wb(xlnt::encoding::utf8);
    auto ws = utf8_wb.get_active_sheet();
    auto cell = ws[xlnt::cell_reference("A1")];

    const std::vector<std::string> valid_utf8_strings =
    {
        "a",                // one-byte sequence (ASCII)
        "\xc3\xa0",         // two-byte sequence
        "\xc3\xb1",         // two-byte sequence
        "\xe2\x82\xa1",     // three-byte sequence
        "\xf0\x90\x8c\xbc", // four-byte sequence
    };

    // Iterate by const reference: the fixtures are only read, so there is
    // no reason to copy each string on every iteration.
    for (const auto &valid : valid_utf8_strings)
    {
        TS_ASSERT_THROWS_NOTHING(cell.check_string(valid));
        TS_ASSERT_THROWS_NOTHING(cell.set_value(valid));
    }

    const std::vector<std::string> invalid_utf8_strings =
    {
        "\xc3\x28",         // two-byte lead followed by a non-continuation byte
        "\xa0\xa1",         // continuation byte where a lead byte is required
        "\xe2\x28\xa1",     // three-byte sequence, bad second byte
        "\xe2\x82\x28",     // three-byte sequence, bad third byte
        "\xf0\x28\x8c\xbc", // four-byte sequence, bad second byte
        "\xf0\x90\x28\xbc", // four-byte sequence, bad third byte
        "\xf0\x28\x8c\x28", // four-byte sequence, bad second and fourth bytes
    };

    for (const auto &invalid : invalid_utf8_strings)
    {
        TS_ASSERT_THROWS(cell.check_string(invalid), xlnt::unicode_decode_error);
        TS_ASSERT_THROWS(cell.set_value(invalid), xlnt::unicode_decode_error);
    }
}
};

View File

@ -27,8 +27,12 @@ bool shared_strings_serializer::read_shared_strings(const xml_document &xml, std
strings.clear();
auto root_node = xml.get_child("sst");
auto unique_count = 0;
auto unique_count = std::stoull(root_node.get_attribute("uniqueCount"));
if (root_node.has_attribute("uniqueCount"))
{
unique_count = std::stoull(root_node.get_attribute("uniqueCount"));
}
for (const auto &si_node : root_node.get_children())
{

View File

@ -0,0 +1,47 @@
#include <xlnt/utils/utf8string.hpp>
namespace xlnt {
// Builds a utf8string whose stored bytes are exactly the chars of s, in
// order. No validation is performed here; call is_valid() on the result.
utf8string utf8string::from_utf8(const std::string &s)
{
    utf8string converted;
    converted.bytes_.assign(s.begin(), s.end());
    return converted;
}
// Builds a utf8string from Latin-1 (ISO-8859-1) text.
//
// Every Latin-1 byte value equals its Unicode code point (U+0000..U+00FF).
// Values below 0x80 are encoded identically in UTF-8; values 0x80..0xFF must
// be expanded to a two-byte UTF-8 sequence. Copying them verbatim would
// store bytes that are not valid UTF-8.
utf8string utf8string::from_latin1(const std::string &s)
{
    utf8string result;
    result.bytes_.reserve(s.size());

    for (const char c : s)
    {
        // Go through an unsigned byte so a signed char never sign-extends.
        const auto code_point = static_cast<std::uint8_t>(c);

        if (code_point < 0x80)
        {
            result.bytes_.push_back(code_point);
        }
        else
        {
            // 110xxxxx 10xxxxxx: top two bits in the lead byte, low six in the trail byte.
            result.bytes_.push_back(static_cast<std::uint8_t>(0xC0 | (code_point >> 6)));
            result.bytes_.push_back(static_cast<std::uint8_t>(0x80 | (code_point & 0x3F)));
        }
    }

    return result;
}
// Builds a utf8string from UTF-16 data carried in a std::string.
//
// utf8::utf16to8 expects iterators over 16-bit code units. Handing it the
// string's char iterators makes it treat every single byte as a complete
// code unit, so the bytes are first reassembled into 16-bit units.
//
// NOTE(review): the units are assumed to be stored in the platform's native
// byte order - confirm against the callers that produce these strings.
//
// Throws utf8::not_enough_room when s ends in the middle of a code unit;
// exceptions raised by utf8::utf16to8 for malformed input propagate.
utf8string utf8string::from_utf16(const std::string &s)
{
    constexpr auto unit_size = sizeof(std::uint16_t);

    if (s.size() % unit_size != 0)
    {
        throw utf8::not_enough_room();
    }

    // Byte-wise copy into properly aligned storage; writing through char *
    // avoids reading the string buffer through a misaligned uint16_t pointer.
    std::vector<std::uint16_t> units(s.size() / unit_size);
    std::copy(s.data(), s.data() + s.size(), reinterpret_cast<char *>(units.data()));

    utf8string result;
    utf8::utf16to8(units.begin(), units.end(), std::back_inserter(result.bytes_));
    return result;
}
// Builds a utf8string from UTF-32 data carried in a std::string.
//
// utf8::utf32to8 expects iterators over 32-bit code points. Handing it the
// string's char iterators makes it treat every single byte as a complete
// code point, so the bytes are first reassembled into 32-bit units.
//
// NOTE(review): the units are assumed to be stored in the platform's native
// byte order - confirm against the callers that produce these strings.
//
// Throws utf8::not_enough_room when s ends in the middle of a code point;
// exceptions raised by utf8::utf32to8 for invalid code points propagate.
utf8string utf8string::from_utf32(const std::string &s)
{
    constexpr auto unit_size = sizeof(std::uint32_t);

    if (s.size() % unit_size != 0)
    {
        throw utf8::not_enough_room();
    }

    // Byte-wise copy into properly aligned storage; writing through char *
    // avoids reading the string buffer through a misaligned uint32_t pointer.
    std::vector<std::uint32_t> units(s.size() / unit_size);
    std::copy(s.data(), s.data() + s.size(), reinterpret_cast<char *>(units.data()));

    utf8string result;
    utf8::utf32to8(units.begin(), units.end(), std::back_inserter(result.bytes_));
    return result;
}
// Reports whether the characters of s form valid UTF-8, as judged by
// utf8::is_valid over the whole string.
bool utf8string::is_valid(const std::string &s)
{
    const auto first = s.begin();
    const auto last = s.end();
    return utf8::is_valid(first, last);
}
// Reports whether the bytes stored in this instance form valid UTF-8, as
// judged by utf8::is_valid over the whole buffer.
bool utf8string::is_valid() const
{
    const auto &stored = bytes_;
    return utf8::is_valid(stored.begin(), stored.end());
}
} // namespace xlnt

View File

@ -207,4 +207,10 @@ public:
TS_ASSERT_EQUALS(test_sheet.get_cell("A1").get_value<long double>(), float_value);
}
void test_read_empty_shared_strings()
{
xlnt::workbook test_ss;
TS_ASSERT_THROWS_NOTHING(test_ss.load(PathHelper::GetDataDirectory("/genuine/number_empty_shared_strings.xlsx")));
}
};

Binary file not shown.

1
third-party/utfcpp vendored Submodule

@ -0,0 +1 @@
Subproject commit f029fcc2fbc7cd979925f198f7e6ca8170d45000