xLnt. shared string performance optimization.

This commit is contained in:
Andrii Tkachenko 2018-02-08 09:52:10 +01:00
parent cb55735644
commit 403605a536
8 changed files with 77 additions and 39 deletions

View File

@ -130,4 +130,18 @@ private:
std::vector<rich_text_run> runs_;
};
class XLNT_API rich_text_hash
{
public:
std::size_t operator()(const rich_text& k) const
{
std::size_t res = 0;
for (auto r : k.runs())
res ^= std::hash<std::string>()(r.first);
return res;
}
};
} // namespace xlnt

View File

@ -31,8 +31,10 @@
#include <unordered_map>
#include <utility>
#include <vector>
#include <map>
#include <xlnt/xlnt_config.hpp>
#include <xlnt/cell/rich_text.hpp>
namespace xlnt {
@ -704,17 +706,27 @@ public:
/// </summary>
std::size_t add_shared_string(const rich_text &shared, bool allow_duplicates = false);
/// <summary>
/// Returns a reference to the shared strings being used by cells
/// in this workbook.
/// </summary>
std::vector<rich_text> &shared_strings();
/// <summary>
/// Returns a reference to the shared string ordered by id
/// </summary>
const std::map<std::size_t, rich_text> &workbook::shared_strings_by_id() const;
/// <summary>
/// Returns a reference to the shared string related to the specified index
/// </summary>
const rich_text& workbook::shared_strings(std::size_t index) const;
/// <summary>
/// Returns a reference to the shared strings being used by cells
/// in this workbook.
/// </summary>
const std::vector<rich_text> &shared_strings() const;
std::unordered_map<rich_text, std::size_t, rich_text_hash> &shared_strings();
/// <summary>
/// Returns a reference to the shared strings being used by cells
/// in this workbook.
/// </summary>
const std::unordered_map<rich_text, std::size_t, rich_text_hash> &shared_strings() const;
// Thumbnail

View File

@ -642,7 +642,7 @@ XLNT_API rich_text cell::value() const
{
if (data_type() == cell::type::shared_string)
{
return workbook().shared_strings().at(static_cast<std::size_t>(d_->value_numeric_));
return workbook().shared_strings(static_cast<std::size_t>(d_->value_numeric_));
}
return d_->value_text_;

View File

@ -27,7 +27,10 @@ namespace xlnt {
bool rich_text_run::operator<(const rich_text_run &other) const
{
return first < other.first && second < other.second;
if (first != other.first)
return first < other.first;
return second < other.second;
}
bool rich_text_run::operator==(const rich_text_run &other) const

View File

@ -53,7 +53,8 @@ struct workbook_impl
workbook_impl(const workbook_impl &other)
: active_sheet_index_(other.active_sheet_index_),
worksheets_(other.worksheets_),
shared_strings_(other.shared_strings_),
shared_strings_ids_(other.shared_strings_ids_),
shared_strings_values_(other.shared_strings_values_),
stylesheet_(other.stylesheet_),
manifest_(other.manifest_),
theme_(other.theme_),
@ -71,8 +72,8 @@ struct workbook_impl
active_sheet_index_ = other.active_sheet_index_;
worksheets_.clear();
std::copy(other.worksheets_.begin(), other.worksheets_.end(), back_inserter(worksheets_));
shared_strings_.clear();
std::copy(other.shared_strings_.begin(), other.shared_strings_.end(), std::back_inserter(shared_strings_));
shared_strings_ids_ = other.shared_strings_ids_;
shared_strings_values_ = other.shared_strings_values_;
theme_ = other.theme_;
manifest_ = other.manifest_;
@ -91,7 +92,8 @@ struct workbook_impl
optional<std::size_t> active_sheet_index_;
std::list<worksheet_impl> worksheets_;
std::vector<rich_text> shared_strings_;
std::unordered_map<rich_text, std::size_t, rich_text_hash> shared_strings_ids_;
std::map<std::size_t, rich_text> shared_strings_values_;
optional<stylesheet> stylesheet_;

View File

@ -1675,18 +1675,17 @@ void xlsx_consumer::read_shared_string_table()
unique_count = parser().attribute<std::size_t>("uniqueCount");
}
auto &strings = target_.shared_strings();
while (in_element(qn("spreadsheetml", "sst")))
{
expect_start_element(qn("spreadsheetml", "si"), xml::content::complex);
strings.push_back(read_rich_text(qn("spreadsheetml", "si")));
auto rt = read_rich_text(qn("spreadsheetml", "si"));
target_.add_shared_string(rt);
expect_end_element(qn("spreadsheetml", "si"));
}
expect_end_element(qn("spreadsheetml", "sst"));
if (has_unique_count && unique_count != strings.size())
if (has_unique_count && unique_count != target_.shared_strings().size())
{
throw invalid_file("sizes don't match");
}

View File

@ -818,20 +818,20 @@ void xlsx_producer::write_shared_string_table(const relationship & /*rel*/)
#pragma clang diagnostic pop
write_attribute("count", string_count);
write_attribute("uniqueCount", source_.shared_strings().size());
write_attribute("uniqueCount", source_.shared_strings_by_id().size());
auto has_trailing_whitespace = [](const std::string &s)
{
return !s.empty() && (s.front() == ' ' || s.back() == ' ');
};
for (const auto &string : source_.shared_strings())
for (const auto &string : source_.shared_strings_by_id())
{
if (string.runs().size() == 1 && !string.runs().at(0).second.is_set())
if (string.second.runs().size() == 1 && !string.second.runs().at(0).second.is_set())
{
write_start_element(xmlns, "si");
write_start_element(xmlns, "t");
write_characters(string.plain_text(), has_trailing_whitespace(string.plain_text()));
write_characters(string.second.plain_text(), has_trailing_whitespace(string.second.plain_text()));
write_end_element(xmlns, "t");
write_end_element(xmlns, "si");
@ -840,7 +840,7 @@ void xlsx_producer::write_shared_string_table(const relationship & /*rel*/)
write_start_element(xmlns, "si");
for (const auto &run : string.runs())
for (const auto &run : string.second.runs())
{
write_start_element(xmlns, "r");

View File

@ -1251,39 +1251,47 @@ const manifest &workbook::manifest() const
return d_->manifest_;
}
std::vector<rich_text> &workbook::shared_strings()
const std::map<std::size_t, rich_text> &workbook::shared_strings_by_id() const
{
return d_->shared_strings_;
return d_->shared_strings_values_;
}
const std::vector<rich_text> &workbook::shared_strings() const
const rich_text& workbook::shared_strings(std::size_t index) const
{
return d_->shared_strings_;
auto it = d_->shared_strings_values_.find(index);
if (it != d_->shared_strings_values_.end())
return it->second;
static rich_text empty;
return empty;
}
std::unordered_map<rich_text, std::size_t, rich_text_hash> &workbook::shared_strings()
{
return d_->shared_strings_ids_;
}
const std::unordered_map<rich_text, std::size_t, rich_text_hash> &workbook::shared_strings() const
{
return d_->shared_strings_ids_;
}
std::size_t workbook::add_shared_string(const rich_text &shared, bool allow_duplicates)
{
register_workbook_part(relationship_type::shared_string_table);
auto index = std::size_t(0);
if (!allow_duplicates)
{
// TODO: inefficient, use a set or something?
for (auto &s : d_->shared_strings_)
{
if (s == shared)
{
return index;
}
++index;
}
auto it = d_->shared_strings_ids_.find(shared);
if (it != d_->shared_strings_ids_.end())
return it->second;
}
d_->shared_strings_.push_back(shared);
auto sz = d_->shared_strings_ids_.size();
d_->shared_strings_ids_[shared] = sz;
d_->shared_strings_values_[sz] = shared;
return index;
return sz;
}
bool workbook::contains(const std::string &sheet_title) const