mirror of
https://github.com/tfussell/xlnt.git
synced 2024-03-22 13:11:17 +08:00
fix #35 by using utfcpp for utf8 string validation
This commit is contained in:
parent
31c903589f
commit
26d0ace151
@ -29,7 +29,7 @@ include_directories(../include/xlnt)
|
||||
include_directories(../source)
|
||||
include_directories(../third-party/miniz)
|
||||
include_directories(../third-party/pugixml/src)
|
||||
include_directories(../third-party/utfcpp)
|
||||
include_directories(../third-party/utfcpp/source)
|
||||
|
||||
FILE(GLOB ROOT_HEADERS ../include/xlnt/*.hpp)
|
||||
FILE(GLOB CELL_HEADERS ../include/xlnt/cell/*.hpp)
|
||||
|
25
include/xlnt/utils/utf8string.hpp
Normal file
25
include/xlnt/utils/utf8string.hpp
Normal file
@ -0,0 +1,25 @@
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <utf8.h>
|
||||
|
||||
namespace xlnt {
|
||||
|
||||
/// Holds a sequence of bytes meant to be UTF-8 encoded text. Instances are
/// created through the static from_* factories, one per source encoding, and
/// can be asked whether the bytes they hold form valid UTF-8.
class utf8string
{
public:
    /// Creates a utf8string from text that is already UTF-8 encoded.
    static utf8string from_utf8(const std::string &s);

    /// Creates a utf8string from Latin-1 encoded text.
    static utf8string from_latin1(const std::string &s);

    /// Creates a utf8string from UTF-16 encoded text.
    static utf8string from_utf16(const std::string &s);

    /// Creates a utf8string from UTF-32 encoded text.
    static utf8string from_utf32(const std::string &s);

    /// Returns true if the bytes of s form valid UTF-8.
    static bool is_valid(const std::string &s);

    /// Returns true if the bytes held by this object form valid UTF-8.
    bool is_valid() const;

private:
    /// Raw storage for the encoded text, one element per byte.
    std::vector<std::uint8_t> bytes_;
};
|
||||
|
||||
} // namespace xlnt
|
@ -14,6 +14,7 @@
|
||||
#include <xlnt/utils/time.hpp>
|
||||
#include <xlnt/utils/timedelta.hpp>
|
||||
#include <xlnt/utils/exceptions.hpp>
|
||||
#include <xlnt/utils/utf8string.hpp>
|
||||
#include <xlnt/workbook/workbook.hpp>
|
||||
#include <xlnt/worksheet/column_properties.hpp>
|
||||
#include <xlnt/worksheet/row_properties.hpp>
|
||||
@ -61,108 +62,26 @@ std::string cell::check_string(const std::string &to_check)
|
||||
break;
|
||||
case encoding::utf8:
|
||||
{
|
||||
std::vector<std::uint8_t> bytes;
|
||||
|
||||
for (char c : s)
|
||||
if(!utf8string::is_valid(s))
|
||||
{
|
||||
auto byte = static_cast<std::uint8_t>(c);
|
||||
|
||||
if (byte < 128)
|
||||
{
|
||||
if(!bytes.empty())
|
||||
{
|
||||
throw xlnt::unicode_decode_error(c);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!bytes.empty())
|
||||
{
|
||||
if(byte >> 6 != 2)
|
||||
{
|
||||
throw xlnt::unicode_decode_error(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bytes.push_back(byte);
|
||||
|
||||
auto first_byte = bytes[0];
|
||||
auto num_bytes = 0;
|
||||
|
||||
if(first_byte < 128)
|
||||
{
|
||||
num_bytes = 1;
|
||||
}
|
||||
else if(first_byte >> 5 == 0b110)
|
||||
{
|
||||
num_bytes = 2;
|
||||
}
|
||||
else if(first_byte >> 4 == 0b1110)
|
||||
{
|
||||
num_bytes = 3;
|
||||
}
|
||||
else if(first_byte >> 3 == 0b11110)
|
||||
{
|
||||
num_bytes = 4;
|
||||
}
|
||||
else if(first_byte >> 2 == 0b111110)
|
||||
{
|
||||
num_bytes = 5;
|
||||
}
|
||||
else if(first_byte >> 1 == 0b1111110)
|
||||
{
|
||||
num_bytes = 6;
|
||||
}
|
||||
|
||||
if(num_bytes > bytes.size())
|
||||
{
|
||||
throw xlnt::unicode_decode_error(c);
|
||||
}
|
||||
|
||||
if(num_bytes == bytes.size())
|
||||
{
|
||||
bytes.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Check last code point
|
||||
if(!bytes.empty())
|
||||
throw xlnt::unicode_decode_error('0');
|
||||
}
|
||||
break;
|
||||
}
|
||||
case encoding::utf16:
|
||||
{
|
||||
if(!utf8string::from_utf16(s).is_valid())
|
||||
{
|
||||
auto first_byte = bytes[0];
|
||||
auto num_bytes = 0;
|
||||
|
||||
if(first_byte < 128)
|
||||
{
|
||||
num_bytes = 1;
|
||||
}
|
||||
else if(first_byte >> 5 == 0b110)
|
||||
{
|
||||
num_bytes = 2;
|
||||
}
|
||||
else if(first_byte >> 4 == 0b1110)
|
||||
{
|
||||
num_bytes = 3;
|
||||
}
|
||||
else if(first_byte >> 3 == 0b11110)
|
||||
{
|
||||
num_bytes = 4;
|
||||
}
|
||||
else if(first_byte >> 2 == 0b111110)
|
||||
{
|
||||
num_bytes = 5;
|
||||
}
|
||||
else if(first_byte >> 1 == 0b1111110)
|
||||
{
|
||||
num_bytes = 6;
|
||||
}
|
||||
|
||||
if(num_bytes > bytes.size())
|
||||
{
|
||||
throw xlnt::unicode_decode_error();
|
||||
}
|
||||
}
|
||||
|
||||
throw xlnt::unicode_decode_error('0');
|
||||
}
|
||||
break;
|
||||
}
|
||||
case encoding::utf32:
|
||||
{
|
||||
if(!utf8string::from_utf32(s).is_valid())
|
||||
{
|
||||
throw xlnt::unicode_decode_error('0');
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
@ -448,4 +448,43 @@ public:
|
||||
cell.set_quote_prefix(true);
|
||||
TS_ASSERT(cell.quote_prefix());
|
||||
}
|
||||
|
||||
void test_check_string()
|
||||
{
|
||||
xlnt::workbook utf8_wb(xlnt::encoding::utf8);
|
||||
auto ws = utf8_wb.get_active_sheet();
|
||||
auto cell = ws[xlnt::cell_reference("A1")];
|
||||
|
||||
std::vector<std::string> valid_utf8_strings =
|
||||
{
|
||||
"a",
|
||||
"\xc3\xa0",
|
||||
"\xc3\xb1",
|
||||
"\xe2\x82\xa1",
|
||||
"\xf0\x90\x8c\xbc",
|
||||
};
|
||||
|
||||
for(auto valid : valid_utf8_strings)
|
||||
{
|
||||
TS_ASSERT_THROWS_NOTHING(cell.check_string(valid));
|
||||
TS_ASSERT_THROWS_NOTHING(cell.set_value(valid));
|
||||
}
|
||||
|
||||
std::vector<std::string> invalid_utf8_strings =
|
||||
{
|
||||
"\xc3\x28",
|
||||
"\xa0\xa1",
|
||||
"\xe2\x28\xa1",
|
||||
"\xe2\x82\x28",
|
||||
"\xf0\x28\x8c\xbc",
|
||||
"\xf0\x90\x28\xbc",
|
||||
"\xf0\x28\x8c\x28",
|
||||
};
|
||||
|
||||
for(auto invalid : invalid_utf8_strings)
|
||||
{
|
||||
TS_ASSERT_THROWS(cell.check_string(invalid), xlnt::unicode_decode_error);
|
||||
TS_ASSERT_THROWS(cell.set_value(invalid), xlnt::unicode_decode_error);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
47
source/utils/utf8string.cpp
Normal file
47
source/utils/utf8string.cpp
Normal file
@ -0,0 +1,47 @@
|
||||
#include <xlnt/utils/utf8string.hpp>
|
||||
|
||||
namespace xlnt {
|
||||
|
||||
/// Creates a utf8string whose bytes are a verbatim copy of s.
///
/// No validation is performed here; call is_valid() on the result to find
/// out whether the copied bytes are well-formed UTF-8.
///
/// \param s text that is expected to be UTF-8 encoded already.
/// \return a utf8string holding the same bytes as s.
utf8string utf8string::from_utf8(const std::string &s)
{
    utf8string copied;
    copied.bytes_.assign(s.begin(), s.end());
    return copied;
}
|
||||
|
||||
/// Creates a utf8string from Latin-1 (ISO-8859-1) encoded text.
///
/// Each Latin-1 byte denotes the Unicode code point with the same numeric
/// value. Bytes below 0x80 are ASCII and are stored unchanged. Bytes from
/// 0x80 to 0xFF are expanded into the two-byte UTF-8 form, because copying
/// them verbatim would leave the buffer holding invalid UTF-8.
///
/// \param s Latin-1 encoded text, one character per byte.
/// \return a utf8string holding the UTF-8 encoding of s.
utf8string utf8string::from_latin1(const std::string &s)
{
    utf8string result;
    // At least one output byte per input byte; non-ASCII input grows further.
    result.bytes_.reserve(s.size());

    for (const char c : s)
    {
        // char may be signed, so go through uint8_t before doing bit math.
        const auto code_point = static_cast<std::uint8_t>(c);

        if (code_point < 0x80)
        {
            result.bytes_.push_back(code_point);
        }
        else
        {
            // Two-byte form: 110xxxxx 10xxxxxx.
            result.bytes_.push_back(static_cast<std::uint8_t>(0xC0 | (code_point >> 6)));
            result.bytes_.push_back(static_cast<std::uint8_t>(0x80 | (code_point & 0x3F)));
        }
    }

    return result;
}
|
||||
|
||||
/// Creates a utf8string by running s through utf8::utf16to8 and collecting
/// the produced bytes.
///
/// NOTE(review): s is iterated one char at a time, so every char is handed
/// to utf8::utf16to8 as a separate input element. If callers pass raw UTF-16
/// bytes (two chars per code unit) the units are never reassembled here;
/// confirm the intended input layout against the callers.
///
/// \param s text to be treated as UTF-16 input.
/// \return a utf8string holding whatever utf8::utf16to8 emitted.
utf8string utf8string::from_utf16(const std::string &s)
{
    utf8string converted;
    auto sink = std::back_inserter(converted.bytes_);
    utf8::utf16to8(s.begin(), s.end(), sink);
    return converted;
}
|
||||
|
||||
/// Creates a utf8string by running s through utf8::utf32to8 and collecting
/// the produced bytes.
///
/// NOTE(review): s is iterated one char at a time, so every char is handed
/// to utf8::utf32to8 as a separate input element. If callers pass raw UTF-32
/// bytes (four chars per code point) they are never reassembled here;
/// confirm the intended input layout against the callers.
///
/// \param s text to be treated as UTF-32 input.
/// \return a utf8string holding whatever utf8::utf32to8 emitted.
utf8string utf8string::from_utf32(const std::string &s)
{
    utf8string converted;
    auto sink = std::back_inserter(converted.bytes_);
    utf8::utf32to8(s.begin(), s.end(), sink);
    return converted;
}
|
||||
|
||||
bool utf8string::is_valid(const std::string &s)
|
||||
{
|
||||
return utf8::is_valid(s.begin(), s.end());
|
||||
}
|
||||
|
||||
bool utf8string::is_valid() const
|
||||
{
|
||||
return utf8::is_valid(bytes_.begin(), bytes_.end());
|
||||
}
|
||||
|
||||
} // namespace xlnt
|
Loading…
x
Reference in New Issue
Block a user