From 403605a53688746034bc3281f3ace1c8c1411432 Mon Sep 17 00:00:00 2001 From: Andrii Tkachenko Date: Thu, 8 Feb 2018 09:52:10 +0100 Subject: [PATCH] xLnt. shared string performance optimization. --- include/xlnt/cell/rich_text.hpp | 14 ++++++ include/xlnt/workbook/workbook.hpp | 24 +++++++--- source/cell/cell.cpp | 2 +- source/cell/rich_text_run.cpp | 5 ++- .../detail/implementations/workbook_impl.hpp | 10 +++-- source/detail/serialization/xlsx_consumer.cpp | 7 ++- source/detail/serialization/xlsx_producer.cpp | 10 ++--- source/workbook/workbook.cpp | 44 +++++++++++-------- 8 files changed, 77 insertions(+), 39 deletions(-) diff --git a/include/xlnt/cell/rich_text.hpp b/include/xlnt/cell/rich_text.hpp index fa1cb486..3eedd30f 100644 --- a/include/xlnt/cell/rich_text.hpp +++ b/include/xlnt/cell/rich_text.hpp @@ -130,4 +130,18 @@ private: std::vector runs_; }; +class XLNT_API rich_text_hash +{ +public: + std::size_t operator()(const rich_text& k) const + { + std::size_t res = 0; + + for (auto r : k.runs()) + res ^= std::hash()(r.first); + + return res; + } +}; + } // namespace xlnt diff --git a/include/xlnt/workbook/workbook.hpp b/include/xlnt/workbook/workbook.hpp index fd2dcc57..b6e04a3d 100644 --- a/include/xlnt/workbook/workbook.hpp +++ b/include/xlnt/workbook/workbook.hpp @@ -31,8 +31,10 @@ #include #include #include +#include #include +#include namespace xlnt { @@ -704,17 +706,27 @@ public: /// std::size_t add_shared_string(const rich_text &shared, bool allow_duplicates = false); - /// - /// Returns a reference to the shared strings being used by cells - /// in this workbook. - /// - std::vector &shared_strings(); + /// + /// Returns a reference to the shared string ordered by id + /// + const std::map &workbook::shared_strings_by_id() const; + + /// + /// Returns a reference to the shared string related to the specified index + /// + const rich_text& workbook::shared_strings(std::size_t index) const; /// /// Returns a reference to the shared strings being used by cells /// in this workbook. /// - const std::vector &shared_strings() const; + std::unordered_map &shared_strings(); + + /// + /// Returns a reference to the shared strings being used by cells + /// in this workbook. + /// + const std::unordered_map &shared_strings() const; // Thumbnail diff --git a/source/cell/cell.cpp b/source/cell/cell.cpp index 6a0545a3..90cae972 100644 --- a/source/cell/cell.cpp +++ b/source/cell/cell.cpp @@ -642,7 +642,7 @@ XLNT_API rich_text cell::value() const { if (data_type() == cell::type::shared_string) { - return workbook().shared_strings().at(static_cast(d_->value_numeric_)); + return workbook().shared_strings(static_cast(d_->value_numeric_)); } return d_->value_text_; diff --git a/source/cell/rich_text_run.cpp b/source/cell/rich_text_run.cpp index 00737fd4..a63859b5 100644 --- a/source/cell/rich_text_run.cpp +++ b/source/cell/rich_text_run.cpp @@ -27,7 +27,10 @@ namespace xlnt { bool rich_text_run::operator<(const rich_text_run &other) const { - return first < other.first && second < other.second; + if (first != other.first) + return first < other.first; + + return second < other.second; } bool rich_text_run::operator==(const rich_text_run &other) const diff --git a/source/detail/implementations/workbook_impl.hpp b/source/detail/implementations/workbook_impl.hpp index 64d15cf7..b22c7670 100644 --- a/source/detail/implementations/workbook_impl.hpp +++ b/source/detail/implementations/workbook_impl.hpp @@ -53,7 +53,8 @@ struct workbook_impl workbook_impl(const workbook_impl &other) : active_sheet_index_(other.active_sheet_index_), worksheets_(other.worksheets_), - shared_strings_(other.shared_strings_), + shared_strings_ids_(other.shared_strings_ids_), + shared_strings_values_(other.shared_strings_values_), stylesheet_(other.stylesheet_), manifest_(other.manifest_), theme_(other.theme_), @@ -71,8 +72,8 @@ struct workbook_impl active_sheet_index_ = other.active_sheet_index_; worksheets_.clear(); std::copy(other.worksheets_.begin(), other.worksheets_.end(), back_inserter(worksheets_)); - shared_strings_.clear(); - std::copy(other.shared_strings_.begin(), other.shared_strings_.end(), std::back_inserter(shared_strings_)); + shared_strings_ids_ = other.shared_strings_ids_; + shared_strings_values_ = other.shared_strings_values_; theme_ = other.theme_; manifest_ = other.manifest_; @@ -91,7 +92,8 @@ struct workbook_impl optional active_sheet_index_; std::list worksheets_; - std::vector shared_strings_; + std::unordered_map shared_strings_ids_; + std::map shared_strings_values_; optional stylesheet_; diff --git a/source/detail/serialization/xlsx_consumer.cpp b/source/detail/serialization/xlsx_consumer.cpp index c7b8bc81..5d685ff9 100644 --- a/source/detail/serialization/xlsx_consumer.cpp +++ b/source/detail/serialization/xlsx_consumer.cpp @@ -1675,18 +1675,17 @@ void xlsx_consumer::read_shared_string_table() unique_count = parser().attribute("uniqueCount"); } - auto &strings = target_.shared_strings(); - while (in_element(qn("spreadsheetml", "sst"))) { expect_start_element(qn("spreadsheetml", "si"), xml::content::complex); - strings.push_back(read_rich_text(qn("spreadsheetml", "si"))); + auto rt = read_rich_text(qn("spreadsheetml", "si")); + target_.add_shared_string(rt); expect_end_element(qn("spreadsheetml", "si")); } expect_end_element(qn("spreadsheetml", "sst")); - if (has_unique_count && unique_count != strings.size()) + if (has_unique_count && unique_count != target_.shared_strings().size()) { throw invalid_file("sizes don't match"); } diff --git a/source/detail/serialization/xlsx_producer.cpp b/source/detail/serialization/xlsx_producer.cpp index 5491ccb6..8040c01a 100644 --- a/source/detail/serialization/xlsx_producer.cpp +++ b/source/detail/serialization/xlsx_producer.cpp @@ -818,20 +818,20 @@ void xlsx_producer::write_shared_string_table(const relationship & /*rel*/) #pragma clang diagnostic pop write_attribute("count", string_count); - write_attribute("uniqueCount", source_.shared_strings().size()); + write_attribute("uniqueCount", source_.shared_strings_by_id().size()); auto has_trailing_whitespace = [](const std::string &s) { return !s.empty() && (s.front() == ' ' || s.back() == ' '); }; - for (const auto &string : source_.shared_strings()) + for (const auto &string : source_.shared_strings_by_id()) { - if (string.runs().size() == 1 && !string.runs().at(0).second.is_set()) + if (string.second.runs().size() == 1 && !string.second.runs().at(0).second.is_set()) { write_start_element(xmlns, "si"); write_start_element(xmlns, "t"); - write_characters(string.plain_text(), has_trailing_whitespace(string.plain_text())); + write_characters(string.second.plain_text(), has_trailing_whitespace(string.second.plain_text())); write_end_element(xmlns, "t"); write_end_element(xmlns, "si"); @@ -840,7 +840,7 @@ void xlsx_producer::write_shared_string_table(const relationship & /*rel*/) write_start_element(xmlns, "si"); - for (const auto &run : string.runs()) + for (const auto &run : string.second.runs()) { write_start_element(xmlns, "r"); diff --git a/source/workbook/workbook.cpp b/source/workbook/workbook.cpp index adb97d51..def108a3 100644 --- a/source/workbook/workbook.cpp +++ b/source/workbook/workbook.cpp @@ -1251,39 +1251,47 @@ const manifest &workbook::manifest() const return d_->manifest_; } -std::vector &workbook::shared_strings() +const std::map &workbook::shared_strings_by_id() const { - return d_->shared_strings_; + return d_->shared_strings_values_; } -const std::vector &workbook::shared_strings() const +const rich_text& workbook::shared_strings(std::size_t index) const { - return d_->shared_strings_; + auto it = d_->shared_strings_values_.find(index); + if (it != d_->shared_strings_values_.end()) + return it->second; + + static rich_text empty; + return empty; +} + +std::unordered_map &workbook::shared_strings() +{ + return d_->shared_strings_ids_; +} + +const std::unordered_map &workbook::shared_strings() const +{ + return d_->shared_strings_ids_; } std::size_t workbook::add_shared_string(const rich_text &shared, bool allow_duplicates) { register_workbook_part(relationship_type::shared_string_table); - auto index = std::size_t(0); - if (!allow_duplicates) { - // TODO: inefficient, use a set or something? - for (auto &s : d_->shared_strings_) - { - if (s == shared) - { - return index; - } - - ++index; - } + auto it = d_->shared_strings_ids_.find(shared); + if (it != d_->shared_strings_ids_.end()) + return it->second; } - d_->shared_strings_.push_back(shared); + auto sz = d_->shared_strings_ids_.size(); + d_->shared_strings_ids_[shared] = sz; + d_->shared_strings_values_[sz] = shared; - return index; + return sz; } bool workbook::contains(const std::string &sheet_title) const