xLnt. shared string performance optimization.

This commit is contained in:
Andrii Tkachenko 2018-02-08 09:52:10 +01:00
parent cb55735644
commit 403605a536
8 changed files with 77 additions and 39 deletions

View File

@ -130,4 +130,18 @@ private:
std::vector<rich_text_run> runs_; std::vector<rich_text_run> runs_;
}; };
/// <summary>
/// Hash functor allowing rich_text to be used as the key of an unordered
/// container. Only the text of each run (run.first) contributes to the hash;
/// run.second is deliberately ignored, which is still a valid hash as long as
/// equal rich_text values have equal run strings in the same order
/// (NOTE(review): relies on rich_text::operator== comparing runs element-wise).
/// </summary>
class XLNT_API rich_text_hash
{
public:
    /// <summary>
    /// Returns a hash of the run strings of k, combined in run order.
    /// An empty rich_text (no runs) hashes to 0.
    /// </summary>
    std::size_t operator()(const rich_text &k) const
    {
        std::size_t res = 0;

        // Bind by const reference: iterating by value would copy each run
        // (including its string) on every single hash computation.
        for (const auto &r : k.runs())
        {
            const std::size_t h = std::hash<std::string>()(r.first);

            // Order-sensitive combine. A plain XOR fold would make two
            // identical runs cancel each other out and would give the same
            // hash to any permutation of the same runs.
            res ^= h + 0x9e3779b9 + (res << 6) + (res >> 2);
        }

        return res;
    }
};
} // namespace xlnt } // namespace xlnt

View File

@ -31,8 +31,10 @@
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <map>
#include <xlnt/xlnt_config.hpp> #include <xlnt/xlnt_config.hpp>
#include <xlnt/cell/rich_text.hpp>
namespace xlnt { namespace xlnt {
@ -704,17 +706,27 @@ public:
/// </summary> /// </summary>
std::size_t add_shared_string(const rich_text &shared, bool allow_duplicates = false); std::size_t add_shared_string(const rich_text &shared, bool allow_duplicates = false);
/// <summary> /// <summary>
/// Returns a reference to the shared strings being used by cells /// Returns a reference to the shared string ordered by id
/// in this workbook. /// </summary>
/// </summary> const std::map<std::size_t, rich_text> &workbook::shared_strings_by_id() const;
std::vector<rich_text> &shared_strings();
/// <summary>
/// Returns a reference to the shared string related to the specified index
/// </summary>
const rich_text& workbook::shared_strings(std::size_t index) const;
/// <summary> /// <summary>
/// Returns a reference to the shared strings being used by cells /// Returns a reference to the shared strings being used by cells
/// in this workbook. /// in this workbook.
/// </summary> /// </summary>
const std::vector<rich_text> &shared_strings() const; std::unordered_map<rich_text, std::size_t, rich_text_hash> &shared_strings();
/// <summary>
/// Returns a reference to the shared strings being used by cells
/// in this workbook.
/// </summary>
const std::unordered_map<rich_text, std::size_t, rich_text_hash> &shared_strings() const;
// Thumbnail // Thumbnail

View File

@ -642,7 +642,7 @@ XLNT_API rich_text cell::value() const
{ {
if (data_type() == cell::type::shared_string) if (data_type() == cell::type::shared_string)
{ {
return workbook().shared_strings().at(static_cast<std::size_t>(d_->value_numeric_)); return workbook().shared_strings(static_cast<std::size_t>(d_->value_numeric_));
} }
return d_->value_text_; return d_->value_text_;

View File

@ -27,7 +27,10 @@ namespace xlnt {
bool rich_text_run::operator<(const rich_text_run &other) const bool rich_text_run::operator<(const rich_text_run &other) const
{ {
return first < other.first && second < other.second; if (first != other.first)
return first < other.first;
return second < other.second;
} }
bool rich_text_run::operator==(const rich_text_run &other) const bool rich_text_run::operator==(const rich_text_run &other) const

View File

@ -53,7 +53,8 @@ struct workbook_impl
workbook_impl(const workbook_impl &other) workbook_impl(const workbook_impl &other)
: active_sheet_index_(other.active_sheet_index_), : active_sheet_index_(other.active_sheet_index_),
worksheets_(other.worksheets_), worksheets_(other.worksheets_),
shared_strings_(other.shared_strings_), shared_strings_ids_(other.shared_strings_ids_),
shared_strings_values_(other.shared_strings_values_),
stylesheet_(other.stylesheet_), stylesheet_(other.stylesheet_),
manifest_(other.manifest_), manifest_(other.manifest_),
theme_(other.theme_), theme_(other.theme_),
@ -71,8 +72,8 @@ struct workbook_impl
active_sheet_index_ = other.active_sheet_index_; active_sheet_index_ = other.active_sheet_index_;
worksheets_.clear(); worksheets_.clear();
std::copy(other.worksheets_.begin(), other.worksheets_.end(), back_inserter(worksheets_)); std::copy(other.worksheets_.begin(), other.worksheets_.end(), back_inserter(worksheets_));
shared_strings_.clear(); shared_strings_ids_ = other.shared_strings_ids_;
std::copy(other.shared_strings_.begin(), other.shared_strings_.end(), std::back_inserter(shared_strings_)); shared_strings_values_ = other.shared_strings_values_;
theme_ = other.theme_; theme_ = other.theme_;
manifest_ = other.manifest_; manifest_ = other.manifest_;
@ -91,7 +92,8 @@ struct workbook_impl
optional<std::size_t> active_sheet_index_; optional<std::size_t> active_sheet_index_;
std::list<worksheet_impl> worksheets_; std::list<worksheet_impl> worksheets_;
std::vector<rich_text> shared_strings_; std::unordered_map<rich_text, std::size_t, rich_text_hash> shared_strings_ids_;
std::map<std::size_t, rich_text> shared_strings_values_;
optional<stylesheet> stylesheet_; optional<stylesheet> stylesheet_;

View File

@ -1675,18 +1675,17 @@ void xlsx_consumer::read_shared_string_table()
unique_count = parser().attribute<std::size_t>("uniqueCount"); unique_count = parser().attribute<std::size_t>("uniqueCount");
} }
auto &strings = target_.shared_strings();
while (in_element(qn("spreadsheetml", "sst"))) while (in_element(qn("spreadsheetml", "sst")))
{ {
expect_start_element(qn("spreadsheetml", "si"), xml::content::complex); expect_start_element(qn("spreadsheetml", "si"), xml::content::complex);
strings.push_back(read_rich_text(qn("spreadsheetml", "si"))); auto rt = read_rich_text(qn("spreadsheetml", "si"));
target_.add_shared_string(rt);
expect_end_element(qn("spreadsheetml", "si")); expect_end_element(qn("spreadsheetml", "si"));
} }
expect_end_element(qn("spreadsheetml", "sst")); expect_end_element(qn("spreadsheetml", "sst"));
if (has_unique_count && unique_count != strings.size()) if (has_unique_count && unique_count != target_.shared_strings().size())
{ {
throw invalid_file("sizes don't match"); throw invalid_file("sizes don't match");
} }

View File

@ -818,20 +818,20 @@ void xlsx_producer::write_shared_string_table(const relationship & /*rel*/)
#pragma clang diagnostic pop #pragma clang diagnostic pop
write_attribute("count", string_count); write_attribute("count", string_count);
write_attribute("uniqueCount", source_.shared_strings().size()); write_attribute("uniqueCount", source_.shared_strings_by_id().size());
auto has_trailing_whitespace = [](const std::string &s) auto has_trailing_whitespace = [](const std::string &s)
{ {
return !s.empty() && (s.front() == ' ' || s.back() == ' '); return !s.empty() && (s.front() == ' ' || s.back() == ' ');
}; };
for (const auto &string : source_.shared_strings()) for (const auto &string : source_.shared_strings_by_id())
{ {
if (string.runs().size() == 1 && !string.runs().at(0).second.is_set()) if (string.second.runs().size() == 1 && !string.second.runs().at(0).second.is_set())
{ {
write_start_element(xmlns, "si"); write_start_element(xmlns, "si");
write_start_element(xmlns, "t"); write_start_element(xmlns, "t");
write_characters(string.plain_text(), has_trailing_whitespace(string.plain_text())); write_characters(string.second.plain_text(), has_trailing_whitespace(string.second.plain_text()));
write_end_element(xmlns, "t"); write_end_element(xmlns, "t");
write_end_element(xmlns, "si"); write_end_element(xmlns, "si");
@ -840,7 +840,7 @@ void xlsx_producer::write_shared_string_table(const relationship & /*rel*/)
write_start_element(xmlns, "si"); write_start_element(xmlns, "si");
for (const auto &run : string.runs()) for (const auto &run : string.second.runs())
{ {
write_start_element(xmlns, "r"); write_start_element(xmlns, "r");

View File

@ -1251,39 +1251,47 @@ const manifest &workbook::manifest() const
return d_->manifest_; return d_->manifest_;
} }
std::vector<rich_text> &workbook::shared_strings() const std::map<std::size_t, rich_text> &workbook::shared_strings_by_id() const
{ {
return d_->shared_strings_; return d_->shared_strings_values_;
} }
const std::vector<rich_text> &workbook::shared_strings() const const rich_text& workbook::shared_strings(std::size_t index) const
{ {
return d_->shared_strings_; auto it = d_->shared_strings_values_.find(index);
if (it != d_->shared_strings_values_.end())
return it->second;
static rich_text empty;
return empty;
}
std::unordered_map<rich_text, std::size_t, rich_text_hash> &workbook::shared_strings()
{
return d_->shared_strings_ids_;
}
const std::unordered_map<rich_text, std::size_t, rich_text_hash> &workbook::shared_strings() const
{
return d_->shared_strings_ids_;
} }
std::size_t workbook::add_shared_string(const rich_text &shared, bool allow_duplicates) std::size_t workbook::add_shared_string(const rich_text &shared, bool allow_duplicates)
{ {
register_workbook_part(relationship_type::shared_string_table); register_workbook_part(relationship_type::shared_string_table);
auto index = std::size_t(0);
if (!allow_duplicates) if (!allow_duplicates)
{ {
// TODO: inefficient, use a set or something? auto it = d_->shared_strings_ids_.find(shared);
for (auto &s : d_->shared_strings_) if (it != d_->shared_strings_ids_.end())
{ return it->second;
if (s == shared)
{
return index;
}
++index;
}
} }
d_->shared_strings_.push_back(shared); auto sz = d_->shared_strings_ids_.size();
d_->shared_strings_ids_[shared] = sz;
d_->shared_strings_values_[sz] = shared;
return index; return sz;
} }
bool workbook::contains(const std::string &sheet_title) const bool workbook::contains(const std::string &sheet_title) const