fix #35 by using utfcpp for utf8 string validation

This commit is contained in:
Thomas Fussell 2015-12-22 14:23:47 -05:00
parent 31c903589f
commit 26d0ace151
5 changed files with 131 additions and 101 deletions

View File

@ -29,7 +29,7 @@ include_directories(../include/xlnt)
include_directories(../source)
include_directories(../third-party/miniz)
include_directories(../third-party/pugixml/src)
include_directories(../third-party/utfcpp)
include_directories(../third-party/utfcpp/source)
FILE(GLOB ROOT_HEADERS ../include/xlnt/*.hpp)
FILE(GLOB CELL_HEADERS ../include/xlnt/cell/*.hpp)

View File

@ -0,0 +1,25 @@
// Guard against multiple inclusion: this header defines a class, so including
// it twice in one translation unit would otherwise be a redefinition error.
#pragma once

#include <cstdint>
#include <string>
#include <vector>

#include <utf8.h>

namespace xlnt {

/// <summary>
/// A sequence of bytes intended to hold UTF-8 encoded text, together with
/// helpers for building it from other encodings and for checking that the
/// stored bytes are well-formed UTF-8.
/// </summary>
class utf8string
{
public:
    /// Builds a utf8string from a string whose bytes are already UTF-8.
    /// The bytes are not validated here; call is_valid() to check them.
    static utf8string from_utf8(const std::string &s);

    /// Builds a utf8string from a Latin-1 encoded byte string.
    static utf8string from_latin1(const std::string &s);

    /// Builds a utf8string by converting UTF-16 input held in a std::string.
    static utf8string from_utf16(const std::string &s);

    /// Builds a utf8string by converting UTF-32 input held in a std::string.
    static utf8string from_utf32(const std::string &s);

    /// Returns true if the bytes of s form well-formed UTF-8.
    static bool is_valid(const std::string &s);

    /// Returns true if the bytes stored in this object form well-formed UTF-8.
    bool is_valid() const;

private:
    /// The stored byte sequence.
    std::vector<std::uint8_t> bytes_;
};

} // namespace xlnt

View File

@ -14,6 +14,7 @@
#include <xlnt/utils/time.hpp>
#include <xlnt/utils/timedelta.hpp>
#include <xlnt/utils/exceptions.hpp>
#include <xlnt/utils/utf8string.hpp>
#include <xlnt/workbook/workbook.hpp>
#include <xlnt/worksheet/column_properties.hpp>
#include <xlnt/worksheet/row_properties.hpp>
@ -61,108 +62,26 @@ std::string cell::check_string(const std::string &to_check)
break;
case encoding::utf8:
{
std::vector<std::uint8_t> bytes;
for (char c : s)
if(!utf8string::is_valid(s))
{
auto byte = static_cast<std::uint8_t>(c);
if (byte < 128)
{
if(!bytes.empty())
{
throw xlnt::unicode_decode_error(c);
}
}
else
{
if(!bytes.empty())
{
if(byte >> 6 != 2)
{
throw xlnt::unicode_decode_error(c);
}
}
}
bytes.push_back(byte);
auto first_byte = bytes[0];
auto num_bytes = 0;
if(first_byte < 128)
{
num_bytes = 1;
}
else if(first_byte >> 5 == 0b110)
{
num_bytes = 2;
}
else if(first_byte >> 4 == 0b1110)
{
num_bytes = 3;
}
else if(first_byte >> 3 == 0b11110)
{
num_bytes = 4;
}
else if(first_byte >> 2 == 0b111110)
{
num_bytes = 5;
}
else if(first_byte >> 1 == 0b1111110)
{
num_bytes = 6;
}
if(num_bytes > bytes.size())
{
throw xlnt::unicode_decode_error(c);
}
if(num_bytes == bytes.size())
{
bytes.clear();
}
throw xlnt::unicode_decode_error('0');
}
// Check last code point
if(!bytes.empty())
break;
}
case encoding::utf16:
{
if(!utf8string::from_utf16(s).is_valid())
{
auto first_byte = bytes[0];
auto num_bytes = 0;
if(first_byte < 128)
{
num_bytes = 1;
}
else if(first_byte >> 5 == 0b110)
{
num_bytes = 2;
}
else if(first_byte >> 4 == 0b1110)
{
num_bytes = 3;
}
else if(first_byte >> 3 == 0b11110)
{
num_bytes = 4;
}
else if(first_byte >> 2 == 0b111110)
{
num_bytes = 5;
}
else if(first_byte >> 1 == 0b1111110)
{
num_bytes = 6;
}
if(num_bytes > bytes.size())
{
throw xlnt::unicode_decode_error();
}
throw xlnt::unicode_decode_error('0');
}
break;
}
case encoding::utf32:
{
if(!utf8string::from_utf32(s).is_valid())
{
throw xlnt::unicode_decode_error('0');
}
break;
}
default:

View File

@ -448,4 +448,43 @@ public:
cell.set_quote_prefix(true);
TS_ASSERT(cell.quote_prefix());
}
// Checks that, on a workbook created with UTF-8 encoding, check_string and
// set_value accept well-formed UTF-8 and raise unicode_decode_error for
// malformed byte sequences.
void test_check_string()
{
    xlnt::workbook utf8_wb(xlnt::encoding::utf8);
    auto sheet = utf8_wb.get_active_sheet();
    auto target = sheet[xlnt::cell_reference("A1")];

    // One sample per UTF-8 sequence length (1 to 4 bytes).
    const std::vector<std::string> well_formed =
    {
        "a",                // U+0061, single byte
        "\xc3\xa0",         // U+00E0, two bytes
        "\xc3\xb1",         // U+00F1, two bytes
        "\xe2\x82\xa1",     // U+20A1, three bytes
        "\xf0\x90\x8c\xbc", // U+1033C, four bytes
    };

    for (const std::string &sample : well_formed)
    {
        TS_ASSERT_THROWS_NOTHING(target.check_string(sample));
        TS_ASSERT_THROWS_NOTHING(target.set_value(sample));
    }

    // Each sample breaks a sequence at a different position.
    const std::vector<std::string> malformed =
    {
        "\xc3\x28",         // two-byte lead, bad continuation
        "\xa0\xa1",         // continuation bytes with no lead byte
        "\xe2\x28\xa1",     // three-byte sequence, bad second byte
        "\xe2\x82\x28",     // three-byte sequence, bad third byte
        "\xf0\x28\x8c\xbc", // four-byte sequence, bad second byte
        "\xf0\x90\x28\xbc", // four-byte sequence, bad third byte
        "\xf0\x28\x8c\x28", // four-byte sequence, bad second and fourth bytes
    };

    for (const std::string &sample : malformed)
    {
        TS_ASSERT_THROWS(target.check_string(sample), xlnt::unicode_decode_error);
        TS_ASSERT_THROWS(target.set_value(sample), xlnt::unicode_decode_error);
    }
}
};

View File

@ -0,0 +1,47 @@
#include <xlnt/utils/utf8string.hpp>
namespace xlnt {
/// Wraps bytes that are already UTF-8 encoded.
///
/// The input is copied byte-for-byte into the result; no validation is done
/// here, so callers that need a guarantee should follow up with is_valid().
///
/// @param s Bytes assumed to be UTF-8.
/// @return A utf8string holding a copy of the bytes of s.
utf8string utf8string::from_utf8(const std::string &s)
{
    utf8string wrapped;
    // Each char is converted to std::uint8_t as it is stored.
    wrapped.bytes_.assign(s.begin(), s.end());
    return wrapped;
}
/// Builds a utf8string from Latin-1 (ISO-8859-1) text by transcoding it to UTF-8.
///
/// Every Latin-1 byte denotes the Unicode code point with the same numeric
/// value. Bytes below 0x80 are identical in UTF-8 and are stored unchanged;
/// bytes 0x80-0xFF must be expanded to the two-byte UTF-8 form of
/// U+0080-U+00FF. Storing them verbatim would generally not yield valid UTF-8.
///
/// @param s Latin-1 encoded bytes.
/// @return A utf8string holding the UTF-8 encoding of s.
utf8string utf8string::from_latin1(const std::string &s)
{
    utf8string result;
    // At least one output byte per input byte; non-ASCII bytes need two.
    result.bytes_.reserve(s.size());

    for (const char c : s)
    {
        // Go through an unsigned type so bytes >= 0x80 are not sign-extended.
        const auto code_point = static_cast<std::uint8_t>(c);

        if (code_point < 0x80)
        {
            result.bytes_.push_back(code_point);
        }
        else
        {
            // Two-byte UTF-8 layout: 110xxxxx 10xxxxxx.
            result.bytes_.push_back(static_cast<std::uint8_t>(0xC0 | (code_point >> 6)));
            result.bytes_.push_back(static_cast<std::uint8_t>(0x80 | (code_point & 0x3F)));
        }
    }

    return result;
}
/// Converts UTF-16 input to UTF-8 using utf8::utf16to8 and stores the result.
///
/// NOTE(review): the string's char iterators are handed to utf16to8 directly,
/// so each char is presumably read as one UTF-16 code unit rather than pairs
/// of bytes being combined into 16-bit units - confirm against how callers
/// pack UTF-16 into a std::string (and in which byte order).
/// NOTE(review): utf16to8 may throw a utfcpp exception on malformed input
/// instead of returning; verify that callers expect this.
///
/// @param s UTF-16 input held in a std::string.
/// @return A utf8string holding the converted bytes.
utf8string utf8string::from_utf16(const std::string &s)
{
    utf8string converted;
    auto sink = std::back_inserter(converted.bytes_);
    utf8::utf16to8(s.cbegin(), s.cend(), sink);
    return converted;
}
/// Converts UTF-32 input to UTF-8 using utf8::utf32to8 and stores the result.
///
/// NOTE(review): the string's char iterators are handed to utf32to8 directly,
/// so each char is presumably read as one whole code point rather than four
/// bytes being combined into a 32-bit unit - confirm against how callers
/// pack UTF-32 into a std::string (and in which byte order).
/// NOTE(review): utf32to8 may throw a utfcpp exception on an invalid code
/// point instead of returning; verify that callers expect this.
///
/// @param s UTF-32 input held in a std::string.
/// @return A utf8string holding the converted bytes.
utf8string utf8string::from_utf32(const std::string &s)
{
    utf8string converted;
    auto sink = std::back_inserter(converted.bytes_);
    utf8::utf32to8(s.cbegin(), s.cend(), sink);
    return converted;
}
/// Reports whether the bytes of a string are well-formed UTF-8.
///
/// @param s Bytes to check.
/// @return The result of utf8::is_valid over the whole of s.
bool utf8string::is_valid(const std::string &s)
{
    const bool well_formed = utf8::is_valid(s.cbegin(), s.cend());
    return well_formed;
}
/// Reports whether the bytes stored in this object are well-formed UTF-8.
///
/// @return The result of utf8::is_valid over the stored byte sequence.
bool utf8string::is_valid() const
{
    const bool well_formed = utf8::is_valid(bytes_.cbegin(), bytes_.cend());
    return well_formed;
}
} // namespace xlnt