work on streaming cell reading, using ugly copy-pasted code for now [ci skip]
This commit is contained in:
Thomas Fussell 2017-06-24 11:39:37 -04:00
parent 3153600768
commit 6f716c6e89
5 changed files with 201 additions and 22 deletions

View File

@ -31,6 +31,10 @@
#include <xlnt/xlnt_config.hpp> #include <xlnt/xlnt_config.hpp>
namespace xml {
class parser;
}
namespace xlnt { namespace xlnt {
class cell; class cell;
@ -73,7 +77,7 @@ public:
/// Begins reading of the next worksheet in the workbook and optionally /// Begins reading of the next worksheet in the workbook and optionally
/// returns its title if the last worksheet has not yet been read. /// returns its title if the last worksheet has not yet been read.
/// </summary> /// </summary>
std::string begin_worksheet(); void begin_worksheet();
/// <summary> /// <summary>
/// Ends reading of the current worksheet in the workbook and optionally /// Ends reading of the current worksheet in the workbook and optionally
@ -118,6 +122,11 @@ private:
std::vector<std::string> worksheet_queue_; std::vector<std::string> worksheet_queue_;
std::unique_ptr<detail::xlsx_consumer> consumer_; std::unique_ptr<detail::xlsx_consumer> consumer_;
std::unique_ptr<workbook> workbook_; std::unique_ptr<workbook> workbook_;
std::unique_ptr<std::istream> stream_;
std::unique_ptr<std::streambuf> stream_buffer_;
std::unique_ptr<std::istream> part_stream_;
std::unique_ptr<std::streambuf> part_stream_buffer_;
std::unique_ptr<xml::parser> parser_;
}; };
} // namespace xlnt } // namespace xlnt

View File

@ -147,16 +147,154 @@ void xlsx_consumer::open(std::istream &source)
} }
cell xlsx_consumer::read_cell() cell xlsx_consumer::read_cell()
{
if (!has_cell())
{ {
return cell(nullptr); return cell(nullptr);
} }
void xlsx_consumer::read_worksheet(const std::string &rel_id) auto ws = worksheet(stream_worksheet_);
if (in_element(qn("spreadsheetml", "sheetData")))
{
expect_start_element(qn("spreadsheetml", "row"), xml::content::complex); // CT_Row
auto row_index = parser().attribute<row_t>("r");
if (parser().attribute_present("ht"))
{
ws.row_properties(row_index).height = parser().attribute<double>("ht");
}
if (parser().attribute_present("customHeight"))
{
ws.row_properties(row_index).custom_height = is_true(parser().attribute("customHeight"));
}
if (parser().attribute_present("hidden") && is_true(parser().attribute("hidden")))
{
ws.row_properties(row_index).hidden = true;
}
skip_attributes({ qn("x14ac", "dyDescent") });
skip_attributes({ "customFormat", "s", "customFont",
"outlineLevel", "collapsed", "thickTop", "thickBot",
"ph", "spans" });
}
if (!in_element(qn("spreadsheetml", "row")))
{
return cell(nullptr);
}
expect_start_element(qn("spreadsheetml", "c"), xml::content::complex);
auto cell = ws.cell(cell_reference(parser().attribute("r")));
auto has_type = parser().attribute_present("t");
auto type = has_type ? parser().attribute("t") : "n";
auto has_format = parser().attribute_present("s");
auto format_id = static_cast<std::size_t>(has_format ? std::stoull(parser().attribute("s")) : 0LL);
auto has_value = false;
auto value_string = std::string();
auto has_formula = false;
auto has_shared_formula = false;
auto formula_value_string = std::string();
while (in_element(qn("spreadsheetml", "c")))
{
auto current_element = expect_start_element(xml::content::mixed);
if (current_element == qn("spreadsheetml", "v")) // s:ST_Xstring
{
has_value = true;
value_string = read_text();
}
else if (current_element == qn("spreadsheetml", "f")) // CT_CellFormula
{
has_formula = true;
if (parser().attribute_present("t"))
{
has_shared_formula = parser().attribute("t") == "shared";
}
skip_attributes(
{ "aca", "ref", "dt2D", "dtr", "del1", "del2", "r1", "r2", "ca", "si", "bx" });
formula_value_string = read_text();
}
else if (current_element == qn("spreadsheetml", "is")) // CT_Rst
{
expect_start_element(qn("spreadsheetml", "t"), xml::content::simple);
value_string = read_text();
expect_end_element(qn("spreadsheetml", "t"));
}
else
{
unexpected_element(current_element);
}
expect_end_element(current_element);
}
expect_end_element(qn("spreadsheetml", "c"));
if (has_formula && !has_shared_formula)
{
cell.formula(formula_value_string);
}
if (has_value)
{
if (type == "str")
{
cell.d_->value_text_ = value_string;
cell.data_type(cell::type::formula_string);
}
else if (type == "inlineStr")
{
cell.d_->value_text_ = value_string;
cell.data_type(cell::type::inline_string);
}
else if (type == "s")
{
cell.d_->value_numeric_ = std::stold(value_string);
cell.data_type(cell::type::shared_string);
}
else if (type == "b") // boolean
{
cell.value(is_true(value_string));
}
else if (type == "n") // numeric
{
cell.value(std::stold(value_string));
}
else if (!value_string.empty() && value_string[0] == '#')
{
cell.error(value_string);
}
}
if (has_format)
{
cell.format(target_.format(format_id));
}
return cell;
}
void xlsx_consumer::read_worksheet(const std::string &rel_id, bool streaming)
{ {
read_worksheet_begin(rel_id); read_worksheet_begin(rel_id);
if (!streaming)
{
read_worksheet_sheetdata(); read_worksheet_sheetdata();
read_worksheet_end(rel_id); read_worksheet_end(rel_id);
} }
}
std::string xlsx_consumer::read_worksheet_begin(const std::string &rel_id) std::string xlsx_consumer::read_worksheet_begin(const std::string &rel_id)
{ {
@ -874,7 +1012,8 @@ xml::parser &xlsx_consumer::parser()
bool xlsx_consumer::has_cell() bool xlsx_consumer::has_cell()
{ {
return stream_cell_ != nullptr; return in_element(qn("spreadsheetml", "row"))
|| in_element(qn("spreadsheetml", "sheetData"));
} }
std::vector<relationship> xlsx_consumer::read_relationships(const path &part) std::vector<relationship> xlsx_consumer::read_relationships(const path &part)
@ -991,7 +1130,7 @@ void xlsx_consumer::read_part(const std::vector<relationship> &rel_chain, bool s
break; break;
case relationship_type::worksheet: case relationship_type::worksheet:
read_worksheet(rel_chain.back().id()); read_worksheet(rel_chain.back().id(), streaming);
break; break;
case relationship_type::thumbnail: case relationship_type::thumbnail:

View File

@ -198,7 +198,7 @@ private:
/// <summary> /// <summary>
/// xl/sheets/*.xml /// xl/sheets/*.xml
/// </summary> /// </summary>
void read_worksheet(const std::string &rel_id); void read_worksheet(const std::string &rel_id, bool streaming);
/// <summary> /// <summary>
/// xl/sheets/*.xml /// xl/sheets/*.xml

View File

@ -1,5 +1,4 @@
// Copyright (c) 2014-2017 Thomas Fussell // Copyright (c) 2017 Thomas Fussell
// Copyright (c) 2010-2015 openpyxl
// //
// Permission is hereby granted, free of charge, to any person obtaining a copy // Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal // of this software and associated documentation files (the "Software"), to deal
@ -27,6 +26,7 @@
#include <detail/serialization/vector_streambuf.hpp> #include <detail/serialization/vector_streambuf.hpp>
#include <detail/serialization/xlsx_consumer.hpp> #include <detail/serialization/xlsx_consumer.hpp>
#include <xlnt/cell/cell.hpp> #include <xlnt/cell/cell.hpp>
#include <xlnt/packaging/manifest.hpp>
#include <xlnt/utils/optional.hpp> #include <xlnt/utils/optional.hpp>
#include <xlnt/workbook/streaming_workbook_reader.hpp> #include <xlnt/workbook/streaming_workbook_reader.hpp>
#include <xlnt/workbook/workbook.hpp> #include <xlnt/workbook/workbook.hpp>
@ -88,6 +88,7 @@ void streaming_workbook_reader::close()
if (consumer_) if (consumer_)
{ {
consumer_.reset(nullptr); consumer_.reset(nullptr);
stream_buffer_.reset(nullptr);
} }
} }
@ -106,10 +107,25 @@ bool streaming_workbook_reader::has_worksheet()
return !worksheet_queue_.empty(); return !worksheet_queue_.empty();
} }
std::string streaming_workbook_reader::begin_worksheet() void streaming_workbook_reader::begin_worksheet()
{ {
auto next_worksheet_rel = worksheet_queue_.back(); const auto next_worksheet_rel = worksheet_queue_.back();
return consumer_->read_worksheet_begin(next_worksheet_rel); const auto workbook_rel = workbook_->manifest()
.relationship(path("/"), relationship_type::office_document);
const auto worksheet_rel = workbook_->manifest()
.relationship(workbook_rel.target().path(), next_worksheet_rel);
auto rel_chain = std::vector<relationship>{ workbook_rel, worksheet_rel };
const auto &manifest = consumer_->target_.manifest();
const auto part_path = manifest.canonicalize(rel_chain);
auto part_stream_buffer = consumer_->archive_->open(part_path);
part_stream_buffer_.swap(part_stream_buffer);
part_stream_.reset(new std::istream(part_stream_buffer_.get()));
parser_.reset(new xml::parser(*part_stream_, part_path.string()));
consumer_->parser_ = parser_.get();
consumer_->read_worksheet_begin(next_worksheet_rel);
} }
worksheet streaming_workbook_reader::end_worksheet() worksheet streaming_workbook_reader::end_worksheet()
@ -121,29 +137,32 @@ worksheet streaming_workbook_reader::end_worksheet()
void streaming_workbook_reader::open(const std::vector<std::uint8_t> &data) void streaming_workbook_reader::open(const std::vector<std::uint8_t> &data)
{ {
detail::vector_istreambuf buffer(data); stream_buffer_.reset(new detail::vector_istreambuf(data));
std::istream buffer_stream(&buffer); stream_.reset(new std::istream(stream_buffer_.get()));
open(buffer_stream); open(*stream_);
} }
void streaming_workbook_reader::open(const std::string &filename) void streaming_workbook_reader::open(const std::string &filename)
{ {
std::ifstream file_stream; stream_.reset(new std::ifstream());
open_stream(file_stream, filename); open_stream((std::ifstream &)*stream_, filename);
open(*stream_);
} }
#ifdef _MSC_VER #ifdef _MSC_VER
void streaming_workbook_reader::open(const std::wstring &filename) void streaming_workbook_reader::open(const std::wstring &filename)
{ {
std::ifstream file_stream; stream_.reset(new std::ifstream());
open_stream(file_stream, filename); open_stream((std::ifstream &)*stream_, filename);
open(*stream_);
} }
#endif #endif
void streaming_workbook_reader::open(const xlnt::path &filename) void streaming_workbook_reader::open(const xlnt::path &filename)
{ {
std::ifstream file_stream; stream_.reset(new std::ifstream());
open_stream(file_stream, filename.string()); open_stream((std::ifstream &)*stream_, filename.string());
open(*stream_);
} }
void streaming_workbook_reader::open(std::istream &stream) void streaming_workbook_reader::open(std::istream &stream)
@ -151,6 +170,16 @@ void streaming_workbook_reader::open(std::istream &stream)
workbook_.reset(new workbook()); workbook_.reset(new workbook());
consumer_.reset(new detail::xlsx_consumer(*workbook_)); consumer_.reset(new detail::xlsx_consumer(*workbook_));
consumer_->open(stream); consumer_->open(stream);
const auto workbook_rel = workbook_->manifest()
.relationship(path("/"), relationship_type::office_document);
const auto workbook_path = workbook_rel.target().path();
for (auto worksheet_rel : workbook_->manifest()
.relationships(workbook_path, relationship_type::worksheet))
{
worksheet_queue_.push_back(worksheet_rel.id());
}
} }
} // namespace xlnt } // namespace xlnt

View File

@ -58,6 +58,8 @@ public:
register_test(test_read_custom_properties); register_test(test_read_custom_properties);
register_test(test_round_trip_rw); register_test(test_round_trip_rw);
register_test(test_round_trip_rw_encrypted); register_test(test_round_trip_rw_encrypted);
register_test(test_streaming_read);
//register_test(test_streaming_write);
} }
bool workbook_matches_file(xlnt::workbook &wb, const xlnt::path &file) bool workbook_matches_file(xlnt::workbook &wb, const xlnt::path &file)