optimize shared strings and handle formula strings correctly

This commit is contained in:
Thomas Fussell 2017-05-10 08:44:25 -04:00
parent b851d1c143
commit d2be054b7c
10 changed files with 221 additions and 203 deletions

View File

@ -37,18 +37,22 @@ namespace xlnt {
/// </summary>
enum class XLNT_API cell_type
{
/// no value. note: this is different from an empty string value or 0 numeric value
null,
/// number
numeric,
/// string
string,
/// value is a formula
formula,
/// no value; this is the type a cell_impl is constructed with and the type
/// that cell::clear_value() resets a cell to
empty,
/// value is TRUE or FALSE (written to the worksheet XML with t="b")
boolean,
/// value is an ISO 8601 formatted date (written to the worksheet XML with t="d")
date,
/// value is a known error code such as \#VALUE! (written to the worksheet XML with t="e")
error,
/// value is TRUE or FALSE
boolean
/// value is a string stored in the cell itself (written with t="inlineStr"
/// and the text inside an "is"/"t" child element)
inline_string,
/// value is a number (written to the worksheet XML with t="n")
number,
/// value is a string shared with other cells to save space; the cell keeps the
/// index of the string in the workbook's shared string collection
/// (written to the worksheet XML with t="s")
shared_string,
/// value is the string result of a formula (written to the worksheet XML with t="str")
formula_string
};
} // namespace xlnt

View File

@ -699,9 +699,9 @@ public:
/// Append a shared string to the shared string collection in this workbook.
/// This should not generally be called unless you know what you're doing.
/// If allow_duplicates is false and the string is already in the collection,
/// it will not be added.
/// it will not be added and the index of the existing entry is returned.
/// Otherwise returns the index of the added string.
/// </summary>
void add_shared_string(const rich_text &shared, bool allow_duplicates = false);
std::size_t add_shared_string(const rich_text &shared, bool allow_duplicates = false);
/// <summary>
/// Returns a reference to the shared strings being used by cells

View File

@ -194,12 +194,12 @@ cell::cell(detail::cell_impl *d)
// Returns true only when the cell has no value, is not merged, and has neither
// a formula nor a format.
bool cell::garbage_collectible() const
{
return !(data_type() != type::null || is_merged() || has_formula() || has_format());
return !(has_value() || is_merged() || has_formula() || has_format());
}
// Assigning nullptr empties the cell.
void cell::value(std::nullptr_t)
{
d_->type_ = type::null;
// clear_value() zeroes the numeric value, clears the stored text, sets the type
// to empty and also clears the formula.
clear_value();
}
void cell::value(bool boolean_value)
@ -211,82 +211,56 @@ void cell::value(bool boolean_value)
// Numeric setters. Every overload below stores its argument in value_numeric_
// as a long double and sets the cell type to the numeric type.

// Sets the cell to an int value.
void cell::value(int int_value)
{
d_->value_numeric_ = static_cast<long double>(int_value);
d_->type_ = type::numeric;
d_->type_ = type::number;
}
// Sets the cell to an unsigned int value.
void cell::value(unsigned int int_value)
{
d_->value_numeric_ = static_cast<long double>(int_value);
d_->type_ = type::numeric;
d_->type_ = type::number;
}
// Sets the cell to a long long value.
void cell::value(long long int int_value)
{
d_->value_numeric_ = static_cast<long double>(int_value);
d_->type_ = type::numeric;
d_->type_ = type::number;
}
// Sets the cell to an unsigned long long value.
// NOTE(review): long double may not represent every 64-bit integer exactly on
// every platform - confirm whether large values need to round-trip.
void cell::value(unsigned long long int int_value)
{
d_->value_numeric_ = static_cast<long double>(int_value);
d_->type_ = type::numeric;
d_->type_ = type::number;
}
// Sets the cell to a float value.
void cell::value(float float_value)
{
d_->value_numeric_ = static_cast<long double>(float_value);
d_->type_ = type::numeric;
d_->type_ = type::number;
}
// Sets the cell to a double value.
void cell::value(double float_value)
{
d_->value_numeric_ = static_cast<long double>(float_value);
d_->type_ = type::numeric;
d_->type_ = type::number;
}
// Sets the cell to a long double value (stored without conversion).
void cell::value(long double d)
{
d_->value_numeric_ = d;
d_->type_ = type::numeric;
d_->type_ = type::number;
}
// Sets the cell's value from a plain string.
void cell::value(const std::string &s)
{
auto checked = check_string(s);
if (checked.size() > 1 && checked.front() == '=')
{
d_->type_ = type::formula;
formula(checked);
}
else if (cell::error_codes().find(checked) != cell::error_codes().end())
{
error(checked);
}
else
{
d_->type_ = type::string;
d_->value_text_.plain_text(checked);
if (checked.size() > 0)
{
workbook().add_shared_string(d_->value_text_);
}
}
// Wrap the checked string in a rich_text and delegate to value(const rich_text &).
// Unlike the branches above, this call does not look for a leading '=' or for
// error codes; the value(const std::string &, bool infer_type) overload performs
// its own '=' / '#' checks when inference is requested.
value(rich_text(check_string(s)));
}
// Sets the cell's value from rich text by registering it with the workbook's
// shared string collection.
void cell::value(const rich_text &text)
{
if (text.runs().size() == 1 && !text.runs().front().second.is_set())
{
value(text.plain_text());
}
else
{
d_->type_ = type::string;
d_->value_text_ = text;
workbook().add_shared_string(text);
}
// NOTE(review): check_string() returns a string (its result is used in
// value(const std::string &)), but the result is discarded here, so this call
// can only matter through side effects of check_string() - confirm that is intended.
check_string(text.plain_text());
d_->type_ = type::shared_string;
// The index returned by add_shared_string() is kept in value_numeric_;
// value<rich_text>() uses it to look the text up again.
d_->value_numeric_ = static_cast<long double>(workbook().add_shared_string(text));
}
void cell::value(const char *c)
@ -306,28 +280,28 @@ void cell::value(const cell c)
// Date/time setters. Each overload stores the value as a number and applies a
// matching number format to the cell.

// Stores a date as d.to_number(base_date()) with the date_yyyymmdd2 number format.
void cell::value(const date &d)
{
d_->type_ = type::numeric;
d_->type_ = type::number;
d_->value_numeric_ = d.to_number(base_date());
number_format(number_format::date_yyyymmdd2());
}
// Stores a datetime as d.to_number(base_date()) with the date_datetime number format.
void cell::value(const datetime &d)
{
d_->type_ = type::numeric;
d_->type_ = type::number;
d_->value_numeric_ = d.to_number(base_date());
number_format(number_format::date_datetime());
}
// Stores a time as t.to_number() with the date_time6 number format.
void cell::value(const time &t)
{
d_->type_ = type::numeric;
d_->type_ = type::number;
d_->value_numeric_ = t.to_number();
number_format(number_format::date_time6());
}
// Stores a timedelta as t.to_number() with the "[hh]:mm:ss" number format.
void cell::value(const timedelta &t)
{
d_->type_ = type::numeric;
d_->type_ = type::number;
d_->value_numeric_ = t.to_number();
number_format(xlnt::number_format("[hh]:mm:ss"));
}
@ -354,7 +328,7 @@ bool cell::is_merged() const
// A cell counts as a date when it holds a number, has a format, and that
// format's number format is a date format.
bool cell::is_date() const
{
return data_type() == type::numeric && has_format() && number_format().is_date_format();
return data_type() == type::number && has_format() && number_format().is_date_format();
}
cell_reference cell::reference() const
@ -429,6 +403,8 @@ void cell::formula(const std::string &formula)
{
d_->formula_ = formula;
}
data_type(type::number);
worksheet().register_calc_chain_in_manifest();
}
@ -551,7 +527,7 @@ void cell::clear_value()
{
d_->value_numeric_ = 0;
d_->value_text_.clear();
d_->type_ = cell::type::null;
d_->type_ = cell::type::empty;
clear_formula();
}
@ -666,18 +642,23 @@ void cell::protection(const class protection &protection_)
// Returns the cell's text as a plain string.
template <>
XLNT_API std::string cell::value() const
{
return d_->value_text_.plain_text();
// Going through value<rich_text>() lets shared-string cells resolve their text
// from the workbook's shared string collection before flattening it.
return value<rich_text>().plain_text();
}
// Returns the cell's text as rich text.
template <>
XLNT_API rich_text cell::value() const
{
if (data_type() == cell::type::shared_string)
{
// For shared strings value_numeric_ holds the index into the workbook's
// shared string collection. That collection is a std::vector, so at()
// throws std::out_of_range when the index is outside of it.
return workbook().shared_strings().at(static_cast<std::size_t>(d_->value_numeric_));
}
// Every other type returns the text stored on the cell itself.
return d_->value_text_;
}
// Returns true unless the cell's type is the "no value" type.
bool cell::has_value() const
{
return d_->type_ != cell::type::null;
return d_->type_ != cell::type::empty;
}
std::string cell::to_string() const
@ -686,12 +667,13 @@ std::string cell::to_string() const
switch (data_type())
{
case cell::type::null:
case cell::type::empty:
return "";
case cell::type::numeric:
case cell::type::number:
return nf.format(value<long double>(), base_date());
case cell::type::string:
case cell::type::formula:
case cell::type::inline_string:
case cell::type::shared_string:
case cell::type::formula_string:
case cell::type::error:
return nf.format(value<std::string>());
case cell::type::boolean:
@ -731,17 +713,29 @@ void cell::value(const std::string &value_string, bool infer_type)
{
value(value_string);
if (!infer_type)
if (!infer_type || value_string.empty())
{
return;
}
if (value_string.front() == '=' && value_string.size() > 1)
{
formula(value_string);
return;
}
if (value_string.front() == '#' && value_string.size() > 1)
{
error(value_string);
return;
}
auto percentage = cast_percentage(value_string);
if (percentage.first)
{
d_->value_numeric_ = percentage.second;
d_->type_ = cell::type::numeric;
d_->type_ = cell::type::number;
number_format(xlnt::number_format::percentage());
}
else
@ -750,7 +744,7 @@ void cell::value(const std::string &value_string, bool infer_type)
if (time.first)
{
d_->type_ = cell::type::numeric;
d_->type_ = cell::type::number;
number_format(number_format::date_time6());
d_->value_numeric_ = time.second.to_number();
}
@ -761,7 +755,7 @@ void cell::value(const std::string &value_string, bool infer_type)
if (numeric.first)
{
d_->value_numeric_ = numeric.second;
d_->type_ = cell::type::numeric;
d_->type_ = cell::type::number;
}
}
}

View File

@ -29,7 +29,12 @@ namespace xlnt {
namespace detail {
// Default state of a cell: no value, no parent, positioned at column 1 / row 1,
// not merged, numeric value 0.
cell_impl::cell_impl()
: type_(cell_type::null), parent_(nullptr), column_(1), row_(1), is_merged_(false), value_numeric_(0)
: type_(cell_type::empty),
parent_(nullptr),
column_(1),
row_(1),
is_merged_(false),
value_numeric_(0)
{
}

View File

@ -1407,8 +1407,6 @@ void xlsx_consumer::read_worksheet(const std::string &rel_id)
read_namespaces();
xlnt::range_reference full_range;
const auto &shared_strings = target_.shared_strings();
auto &manifest = target_.manifest();
const auto workbook_rel = manifest.relationship(path("/"), relationship_type::office_document);
@ -1699,15 +1697,20 @@ void xlsx_consumer::read_worksheet(const std::string &rel_id)
if (has_value)
{
if (type == "inlineStr" || type == "str")
if (type == "str")
{
cell.value(value_string);
cell.d_->value_text_ = value_string;
cell.data_type(cell::type::formula_string);
}
else if (type == "s" && !has_formula)
else if (type == "inlineStr")
{
auto shared_string_index = static_cast<std::size_t>(std::stoull(value_string));
auto shared_string = shared_strings.at(shared_string_index);
cell.value(shared_string);
cell.d_->value_text_ = value_string;
cell.data_type(cell::type::inline_string);
}
else if (type == "s")
{
cell.d_->value_numeric_ = std::stold(value_string);
cell.data_type(cell::type::shared_string);
}
else if (type == "b") // boolean
{

View File

@ -777,7 +777,7 @@ void xlsx_producer::write_shared_string_table(const relationship & /*rel*/)
while (current_cell.column() <= dimension.bottom_right().column())
{
if (ws.has_cell(current_cell)
&& ws.cell(current_cell).data_type() == cell::type::string)
&& ws.cell(current_cell).data_type() == cell::type::shared_string)
{
++string_count;
}
@ -2145,10 +2145,9 @@ void xlsx_producer::write_worksheet(const relationship &rel)
}
std::unordered_map<std::string, std::string> hyperlink_references;
std::vector<cell_reference> cells_with_comments;
write_start_element(xmlns, "sheetData");
const auto &shared_strings = ws.workbook().shared_strings();
std::vector<cell_reference> cells_with_comments;
for (auto row : ws.rows())
{
@ -2206,122 +2205,124 @@ void xlsx_producer::write_worksheet(const relationship &rel)
}
}
for (auto cell : row)
for (auto cell : row) // CT_Cell
{
if (cell.garbage_collectible()) continue;
// record data about the cell needed later
if (cell.has_comment())
{
cells_with_comments.push_back(cell.reference());
}
if (!cell.garbage_collectible())
if (cell.has_hyperlink())
{
write_start_element(xmlns, "c");
write_attribute("r", cell.reference().to_string());
hyperlink_references[cell.reference().to_string()] = reverse_hyperlink_references[cell.hyperlink()];
}
if (cell.has_format())
write_start_element(xmlns, "c");
// begin cell attributes
write_attribute("r", cell.reference().to_string());
if (cell.has_format())
{
write_attribute("s", cell.format().d_->id);
}
switch (cell.data_type())
{
case cell::type::boolean:
write_attribute("t", "b");
break;
case cell::type::date:
write_attribute("t", "d");
break;
case cell::type::error:
write_attribute("t", "e");
break;
case cell::type::inline_string:
write_attribute("t", "inlineStr");
break;
case cell::type::number:
write_attribute("t", "n");
break;
case cell::type::shared_string:
write_attribute("t", "s");
break;
case cell::type::formula_string:
write_attribute("t", "str");
break;
}
//write_attribute("cm", "");
//write_attribute("vm", "");
//write_attribute("ph", "");
// begin child elements
if (cell.has_formula())
{
write_element(xmlns, "f", cell.formula());
}
switch (cell.data_type())
{
case cell::type::boolean:
write_element(xmlns, "v", write_bool(cell.value<bool>()));
break;
case cell::type::date:
write_element(xmlns, "v", cell.value<std::string>());
break;
case cell::type::error:
write_element(xmlns, "v", cell.value<std::string>());
break;
case cell::type::inline_string:
write_start_element(xmlns, "is");
// TODO: make a write_rich_text method and use that here
write_element(xmlns, "t", cell.value<std::string>());
write_end_element(xmlns, "is");
break;
case cell::type::number:
write_start_element(xmlns, "v");
if (is_integral(cell.value<long double>()))
{
write_attribute("s", cell.format().d_->id);
}
if (cell.has_hyperlink())
{
hyperlink_references[cell.reference().to_string()] = reverse_hyperlink_references[cell.hyperlink()];
}
if (cell.data_type() == cell::type::string)
{
if (cell.has_formula())
{
write_attribute("t", "str");
write_element(xmlns, "f", cell.formula());
write_element(xmlns, "v", cell.to_string());
write_end_element(xmlns, "c");
continue;
}
int match_index = -1;
for (std::size_t i = 0; i < shared_strings.size(); i++)
{
if (shared_strings[i] == cell.value<rich_text>())
{
match_index = static_cast<int>(i);
break;
}
}
if (match_index == -1)
{
if (cell.value<std::string>().empty())
{
write_attribute("t", "s");
}
else
{
write_attribute("t", "inlineStr");
write_start_element(xmlns, "is");
write_element(xmlns, "t", cell.value<std::string>());
write_end_element(xmlns, "is");
}
}
else
{
write_attribute("t", "s");
write_element(xmlns, "v", match_index);
}
write_characters(static_cast<std::ptrdiff_t>(cell.value<long double>()));
}
else
{
if (cell.data_type() != cell::type::null)
{
if (cell.data_type() == cell::type::boolean)
{
write_attribute("t", "b");
write_element(xmlns, "v", write_bool(cell.value<bool>()));
}
else if (cell.data_type() == cell::type::numeric)
{
if (cell.has_formula())
{
write_element(xmlns, "f", cell.formula());
write_element(xmlns, "v", cell.to_string());
write_end_element(xmlns, "c");
continue;
}
write_attribute("t", "n");
write_start_element(xmlns, "v");
if (is_integral(cell.value<long double>()))
{
write_characters(cell.value<long long int>());
}
else
{
std::stringstream ss;
ss.precision(20);
ss << cell.value<long double>();
ss.str();
write_characters(ss.str());
}
write_end_element(xmlns, "v");
}
}
else if (cell.has_formula())
{
write_element(xmlns, "f", cell.formula());
// todo (but probably not) could calculate the formula and set the value here
write_end_element(xmlns, "c");
continue;
}
std::stringstream ss;
ss.precision(20);
ss << cell.value<long double>();
write_characters(ss.str());
}
write_end_element(xmlns, "c");
write_end_element(xmlns, "v");
break;
case cell::type::shared_string:
write_element(xmlns, "v", static_cast<std::size_t>(cell.d_->value_numeric_));
break;
case cell::type::formula_string:
write_element(xmlns, "v", cell.value<std::string>());
break;
}
write_end_element(xmlns, "c");
}
write_end_element(xmlns, "row");

View File

@ -1285,20 +1285,29 @@ const std::vector<rich_text> &workbook::shared_strings() const
return d_->shared_strings_;
}
void workbook::add_shared_string(const rich_text &shared, bool allow_duplicates)
std::size_t workbook::add_shared_string(const rich_text &shared, bool allow_duplicates)
{
register_workbook_part(relationship_type::shared_string_table);
auto index = std::size_t(0);
if (!allow_duplicates)
{
// TODO: inefficient, use a set or something?
for (auto &s : d_->shared_strings_)
{
if (s == shared) return;
if (s == shared)
{
return index;
}
++index;
}
}
d_->shared_strings_.push_back(shared);
return index;
}
bool workbook::contains(const std::string &sheet_title) const

View File

@ -553,7 +553,7 @@ void worksheet::merge_cells(const range_reference &reference)
if (!first)
{
if (cell.data_type() == cell::type::string)
if (cell.data_type() == cell::type::shared_string)
{
cell.value("");
}
@ -672,7 +672,7 @@ bool worksheet::compare(const worksheet &other, bool reference) const
return false;
}
if (this_cell.data_type() == xlnt::cell::type::numeric
if (this_cell.data_type() == xlnt::cell::type::number
&& std::fabs(this_cell.value<long double>() - other_cell.value<long double>()) > 0.L)
{
return false;

View File

@ -121,7 +121,7 @@ private:
auto ws = wb.active_sheet();
auto cell = ws.cell(xlnt::cell_reference("A", 1));
xlnt_assert(cell.data_type() == xlnt::cell::type::null);
xlnt_assert(cell.data_type() == xlnt::cell::type::empty);
xlnt_assert(cell.column() == "A");
xlnt_assert(cell.row() == 1);
xlnt_assert(cell.reference() == "A1");
@ -134,12 +134,12 @@ private:
const auto datatypes =
{
xlnt::cell::type::null,
xlnt::cell::type::empty,
xlnt::cell::type::boolean,
xlnt::cell::type::error,
xlnt::cell::type::formula,
xlnt::cell::type::numeric,
xlnt::cell::type::string
xlnt::cell::type::formula_string,
xlnt::cell::type::number,
xlnt::cell::type::shared_string
};
for (const auto &datatype : datatypes)
@ -150,7 +150,7 @@ private:
cell.data_type(datatype);
xlnt_assert(cell.data_type() == datatype);
cell.clear_value();
xlnt_assert(cell.data_type() == xlnt::cell::type::null);
xlnt_assert(cell.data_type() == xlnt::cell::type::empty);
}
}
@ -161,13 +161,13 @@ private:
auto cell = ws.cell(xlnt::cell_reference(1, 1));
cell.value("hello");
xlnt_assert(cell.data_type() == xlnt::cell::type::string);
xlnt_assert(cell.data_type() == xlnt::cell::type::shared_string);
cell.value(".");
xlnt_assert(cell.data_type() == xlnt::cell::type::string);
xlnt_assert(cell.data_type() == xlnt::cell::type::shared_string);
cell.value("0800");
xlnt_assert(cell.data_type() == xlnt::cell::type::string);
xlnt_assert(cell.data_type() == xlnt::cell::type::shared_string);
}
void test_formula1()
@ -177,7 +177,8 @@ private:
auto cell = ws.cell(xlnt::cell_reference(1, 1));
cell.value("=42", true);
xlnt_assert(cell.data_type() == xlnt::cell::type::formula);
xlnt_assert(cell.data_type() == xlnt::cell::type::number);
xlnt_assert(cell.has_formula());
}
void test_formula2()
@ -187,7 +188,8 @@ private:
auto cell = ws.cell(xlnt::cell_reference(1, 1));
cell.value("=if(A1<4;-1;1)", true);
xlnt_assert(cell.data_type() == xlnt::cell::type::formula);
xlnt_assert(cell.data_type() == xlnt::cell::type::number);
xlnt_assert(cell.has_formula());
}
void test_formula3()
@ -213,8 +215,8 @@ private:
auto ws = wb.active_sheet();
auto cell = ws.cell(xlnt::cell_reference(1, 1));
cell.value("=");
xlnt_assert(cell.data_type() == xlnt::cell::type::string);
cell.value("=", true);
xlnt_assert(cell.data_type() == xlnt::cell::type::shared_string);
xlnt_assert(cell.value<std::string>() == "=");
xlnt_assert(!cell.has_formula());
}
@ -253,7 +255,7 @@ private:
cell.value(xlnt::datetime(2010, 7, 13, 6, 37, 41));
xlnt_assert(cell.data_type() == xlnt::cell::type::numeric);
xlnt_assert(cell.data_type() == xlnt::cell::type::number);
xlnt_assert_delta(cell.value<long double>(), 40372.27616898148L, 1E-9);
xlnt_assert(cell.is_date());
xlnt_assert(cell.number_format().format_string() == "yyyy-mm-dd h:mm:ss");
@ -266,7 +268,7 @@ private:
auto cell = ws.cell(xlnt::cell_reference(1, 1));
cell.value(xlnt::date(2010, 7, 13));
xlnt_assert(cell.data_type() == xlnt::cell::type::numeric);
xlnt_assert(cell.data_type() == xlnt::cell::type::number);
xlnt_assert(cell.value<long double>() == 40372.L);
xlnt_assert(cell.is_date());
xlnt_assert(cell.number_format().format_string() == "yyyy-mm-dd");
@ -279,7 +281,7 @@ private:
auto cell = ws.cell(xlnt::cell_reference(1, 1));
cell.value(xlnt::time(1, 3));
xlnt_assert(cell.data_type() == xlnt::cell::type::numeric);
xlnt_assert(cell.data_type() == xlnt::cell::type::number);
xlnt_assert_delta(cell.value<long double>(), 0.04375L, 1E-9);
xlnt_assert(cell.is_date());
xlnt_assert(cell.number_format().format_string() == "h:mm:ss");
@ -355,7 +357,7 @@ private:
cell.value(xlnt::timedelta(1, 3, 0, 0, 0));
xlnt_assert(cell.value<long double>() == 1.125);
xlnt_assert(cell.data_type() == xlnt::cell::type::numeric);
xlnt_assert(cell.data_type() == xlnt::cell::type::number);
xlnt_assert(!cell.is_date());
xlnt_assert(cell.number_format().format_string() == "[hh]:mm:ss");
}

View File

@ -313,7 +313,7 @@ public:
auto ws = wb.active_sheet();
xlnt::cell cell = ws[xlnt::cell_reference("A1")];
xlnt_assert_equals(cell.reference().to_string(), "A1");
xlnt_assert_equals(cell.data_type(), xlnt::cell::type::null);
xlnt_assert_equals(cell.data_type(), xlnt::cell::type::empty);
}
void test_setitem()