Streaming: skip empty rows in has_cell()/read_cell()

Previously, an empty row would confuse the streaming parser: when we
were positioned in an empty row, our helper methods detected us as being
in neither the "row" _nor_ the "sheetData" element. So `has_cell()`
returned false when it shouldn't have. Similarly, `read_cell()` didn't
skip empty rows, so it returned an invalid cell when positioned in an
empty row, causing a segfault when the caller tried to use the cell.

Callers must take care to call `has_cell()` before `read_cell()`. In
the future, perhaps we can make `read_cell()` return a `std::optional`
and nix `has_cell()` altogether?

[Closes #492]
This commit is contained in:
Adam Hooper 2020-07-28 15:29:12 -04:00
parent 8d2a8e161b
commit 319c4197c1
No known key found for this signature in database
GPG Key ID: AD088538FE23B8D0
4 changed files with 188 additions and 169 deletions

View File

@ -21,6 +21,7 @@
// @license: http://www.opensource.org/licenses/mit-license.php // @license: http://www.opensource.org/licenses/mit-license.php
// @author: see AUTHORS file // @author: see AUTHORS file
#include <cassert>
#include <cctype> #include <cctype>
#include <numeric> // for std::accumulate #include <numeric> // for std::accumulate
#include <sstream> #include <sstream>
@ -406,171 +407,7 @@ void xlsx_consumer::open(std::istream &source)
cell xlsx_consumer::read_cell() cell xlsx_consumer::read_cell()
{ {
if (!has_cell()) return cell(streaming_cell_.get());
{
return cell(nullptr);
}
auto ws = worksheet(current_worksheet_);
if (in_element(qn("spreadsheetml", "sheetData")))
{
expect_start_element(qn("spreadsheetml", "row"), xml::content::complex); // CT_Row
auto row_index = static_cast<row_t>(std::stoul(parser().attribute("r")));
auto &row_properties = ws.row_properties(row_index);
if (parser().attribute_present("ht"))
{
row_properties.height = converter_.deserialise(parser().attribute("ht"));
}
if (parser().attribute_present("customHeight"))
{
row_properties.custom_height = is_true(parser().attribute("customHeight"));
}
if (parser().attribute_present("hidden") && is_true(parser().attribute("hidden")))
{
row_properties.hidden = true;
}
if (parser().attribute_present(qn("x14ac", "dyDescent")))
{
row_properties.dy_descent = converter_.deserialise(parser().attribute(qn("x14ac", "dyDescent")));
}
if (parser().attribute_present("spans"))
{
row_properties.spans = parser().attribute("spans");
}
skip_attributes({"customFormat", "s", "customFont",
"outlineLevel", "collapsed", "thickTop", "thickBot",
"ph"});
}
if (!in_element(qn("spreadsheetml", "row")))
{
return cell(nullptr);
}
expect_start_element(qn("spreadsheetml", "c"), xml::content::complex);
auto cell = streaming_
? xlnt::cell(streaming_cell_.get())
: ws.cell(cell_reference(parser().attribute("r")));
auto reference = cell_reference(parser().attribute("r"));
cell.d_->parent_ = current_worksheet_;
cell.d_->column_ = reference.column_index();
cell.d_->row_ = reference.row();
if (parser().attribute_present("ph"))
{
cell.d_->phonetics_visible_ = parser().attribute<bool>("ph");
}
auto has_type = parser().attribute_present("t");
auto type = has_type ? parser().attribute("t") : "n";
if (parser().attribute_present("s"))
{
cell.format(target_.format(static_cast<std::size_t>(std::stoull(parser().attribute("s")))));
}
auto has_value = false;
auto value_string = std::string();
auto has_formula = false;
auto has_shared_formula = false;
auto formula_value_string = std::string();
while (in_element(qn("spreadsheetml", "c")))
{
auto current_element = expect_start_element(xml::content::mixed);
if (current_element == qn("spreadsheetml", "v")) // s:ST_Xstring
{
has_value = true;
value_string = read_text();
}
else if (current_element == qn("spreadsheetml", "f")) // CT_CellFormula
{
has_formula = true;
if (parser().attribute_present("t"))
{
has_shared_formula = parser().attribute("t") == "shared";
}
skip_attributes({"aca", "ref", "dt2D", "dtr", "del1",
"del2", "r1", "r2", "ca", "si", "bx"});
formula_value_string = read_text();
}
else if (current_element == qn("spreadsheetml", "is")) // CT_Rst
{
expect_start_element(qn("spreadsheetml", "t"), xml::content::simple);
has_value = true;
value_string = read_text();
expect_end_element(qn("spreadsheetml", "t"));
}
else
{
unexpected_element(current_element);
}
expect_end_element(current_element);
}
expect_end_element(qn("spreadsheetml", "c"));
if (has_formula && !has_shared_formula)
{
cell.formula(formula_value_string);
}
if (has_value)
{
if (type == "str")
{
cell.d_->value_text_ = value_string;
cell.data_type(cell::type::formula_string);
}
else if (type == "inlineStr")
{
cell.d_->value_text_ = value_string;
cell.data_type(cell::type::inline_string);
}
else if (type == "s")
{
cell.d_->value_numeric_ = converter_.deserialise(value_string);
cell.data_type(cell::type::shared_string);
}
else if (type == "b") // boolean
{
cell.value(is_true(value_string));
}
else if (type == "n") // numeric
{
cell.value(converter_.deserialise(value_string));
}
else if (!value_string.empty() && value_string[0] == '#')
{
cell.error(value_string);
}
}
if (!in_element(qn("spreadsheetml", "row")))
{
expect_end_element(qn("spreadsheetml", "row"));
if (!in_element(qn("spreadsheetml", "sheetData")))
{
expect_end_element(qn("spreadsheetml", "sheetData"));
}
}
return cell;
} }
void xlsx_consumer::read_worksheet(const std::string &rel_id) void xlsx_consumer::read_worksheet(const std::string &rel_id)
@ -1411,8 +1248,174 @@ xml::parser &xlsx_consumer::parser()
bool xlsx_consumer::has_cell() bool xlsx_consumer::has_cell()
{ {
return in_element(qn("spreadsheetml", "row")) auto ws = worksheet(current_worksheet_);
|| in_element(qn("spreadsheetml", "sheetData"));
while (streaming_cell_ // we're not at the end of the file
&& !in_element(qn("spreadsheetml", "row"))) // we're at the end of a row, or between rows
{
if (parser().peek() == xml::parser::event_type::end_element
&& stack_.back() == qn("spreadsheetml", "row"))
{
// We're at the end of a row.
expect_end_element(qn("spreadsheetml", "row"));
// ... and keep parsing.
}
if (parser().peek() == xml::parser::event_type::end_element
&& stack_.back() == qn("spreadsheetml", "sheetData"))
{
// End of sheet. Mark it by setting streaming_cell_ to nullptr, so we never get here again.
expect_end_element(qn("spreadsheetml", "sheetData"));
streaming_cell_.reset(nullptr);
break;
}
expect_start_element(qn("spreadsheetml", "row"), xml::content::complex); // CT_Row
auto row_index = static_cast<row_t>(std::stoul(parser().attribute("r")));
auto &row_properties = ws.row_properties(row_index);
if (parser().attribute_present("ht"))
{
row_properties.height = converter_.deserialise(parser().attribute("ht"));
}
if (parser().attribute_present("customHeight"))
{
row_properties.custom_height = is_true(parser().attribute("customHeight"));
}
if (parser().attribute_present("hidden") && is_true(parser().attribute("hidden")))
{
row_properties.hidden = true;
}
if (parser().attribute_present(qn("x14ac", "dyDescent")))
{
row_properties.dy_descent = converter_.deserialise(parser().attribute(qn("x14ac", "dyDescent")));
}
if (parser().attribute_present("spans"))
{
row_properties.spans = parser().attribute("spans");
}
skip_attributes({"customFormat", "s", "customFont",
"outlineLevel", "collapsed", "thickTop", "thickBot",
"ph"});
}
if (!streaming_cell_)
{
// We're at the end of the worksheet
return false;
}
expect_start_element(qn("spreadsheetml", "c"), xml::content::complex);
assert(streaming_);
auto cell = xlnt::cell(streaming_cell_.get());
auto reference = cell_reference(parser().attribute("r"));
cell.d_->parent_ = current_worksheet_;
cell.d_->column_ = reference.column_index();
cell.d_->row_ = reference.row();
if (parser().attribute_present("ph"))
{
cell.d_->phonetics_visible_ = parser().attribute<bool>("ph");
}
auto has_type = parser().attribute_present("t");
auto type = has_type ? parser().attribute("t") : "n";
if (parser().attribute_present("s"))
{
cell.format(target_.format(static_cast<std::size_t>(std::stoull(parser().attribute("s")))));
}
auto has_value = false;
auto value_string = std::string();
auto has_formula = false;
auto has_shared_formula = false;
auto formula_value_string = std::string();
while (in_element(qn("spreadsheetml", "c")))
{
auto current_element = expect_start_element(xml::content::mixed);
if (current_element == qn("spreadsheetml", "v")) // s:ST_Xstring
{
has_value = true;
value_string = read_text();
}
else if (current_element == qn("spreadsheetml", "f")) // CT_CellFormula
{
has_formula = true;
if (parser().attribute_present("t"))
{
has_shared_formula = parser().attribute("t") == "shared";
}
skip_attributes({"aca", "ref", "dt2D", "dtr", "del1",
"del2", "r1", "r2", "ca", "si", "bx"});
formula_value_string = read_text();
}
else if (current_element == qn("spreadsheetml", "is")) // CT_Rst
{
expect_start_element(qn("spreadsheetml", "t"), xml::content::simple);
has_value = true;
value_string = read_text();
expect_end_element(qn("spreadsheetml", "t"));
}
else
{
unexpected_element(current_element);
}
expect_end_element(current_element);
}
expect_end_element(qn("spreadsheetml", "c"));
if (has_formula && !has_shared_formula)
{
cell.formula(formula_value_string);
}
if (has_value)
{
if (type == "str")
{
cell.d_->value_text_ = value_string;
cell.data_type(cell::type::formula_string);
}
else if (type == "inlineStr")
{
cell.d_->value_text_ = value_string;
cell.data_type(cell::type::inline_string);
}
else if (type == "s")
{
cell.d_->value_numeric_ = converter_.deserialise(value_string);
cell.data_type(cell::type::shared_string);
}
else if (type == "b") // boolean
{
cell.value(is_true(value_string));
}
else if (type == "n") // numeric
{
cell.value(converter_.deserialise(value_string));
}
else if (!value_string.empty() && value_string[0] == '#')
{
cell.error(value_string);
}
}
return true;
} }
std::vector<relationship> xlsx_consumer::read_relationships(const path &part) std::vector<relationship> xlsx_consumer::read_relationships(const path &part)

View File

@ -413,8 +413,6 @@ private:
std::unique_ptr<detail::cell_impl> streaming_cell_; std::unique_ptr<detail::cell_impl> streaming_cell_;
detail::cell_impl *current_cell_;
detail::worksheet_impl *current_worksheet_; detail::worksheet_impl *current_worksheet_;
number_serialiser converter_; number_serialiser converter_;
}; };

Binary file not shown.

View File

@ -93,6 +93,7 @@ public:
register_test(test_load_save_german_locale); register_test(test_load_save_german_locale);
register_test(test_Issue445_inline_str_load); register_test(test_Issue445_inline_str_load);
register_test(test_Issue445_inline_str_streaming_read); register_test(test_Issue445_inline_str_streaming_read);
register_test(test_Issue492_stream_empty_row);
} }
bool workbook_matches_file(xlnt::workbook &wb, const xlnt::path &file) bool workbook_matches_file(xlnt::workbook &wb, const xlnt::path &file)
@ -733,8 +734,25 @@ public:
xlnt::streaming_workbook_reader wbr; xlnt::streaming_workbook_reader wbr;
wbr.open(path_helper::test_file("Issue445_inline_str.xlsx")); wbr.open(path_helper::test_file("Issue445_inline_str.xlsx"));
wbr.begin_worksheet("Sheet"); wbr.begin_worksheet("Sheet");
xlnt_assert(wbr.has_cell());
auto cell = wbr.read_cell(); auto cell = wbr.read_cell();
xlnt_assert_equals(cell.value<std::string>(), std::string("a")); xlnt_assert_equals(cell.value<std::string>(), std::string("a"));
} }
void test_Issue492_stream_empty_row()
{
xlnt::streaming_workbook_reader wbr;
wbr.open(path_helper::test_file("Issue492_empty_row.xlsx"));
wbr.begin_worksheet("BLS Data Series");
xlnt_assert(wbr.has_cell());
xlnt_assert_equals(wbr.read_cell().reference(), "A1");
xlnt_assert(wbr.has_cell());
xlnt_assert_equals(wbr.read_cell().reference(), "A2");
xlnt_assert(wbr.has_cell());
xlnt_assert_equals(wbr.read_cell().reference(), "A4");
xlnt_assert(wbr.has_cell());
xlnt_assert_equals(wbr.read_cell().reference(), "B4");
xlnt_assert(!wbr.has_cell());
}
}; };
static serialization_test_suite x; static serialization_test_suite x;