From 319c4197c19db4fa0f2e8998bd9ef56a5b20a865 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Tue, 28 Jul 2020 15:29:12 -0400 Subject: [PATCH] Streaming: skip empty rows in has_cell()/read_cell() Previously, an empty row would mess with the parser: if we're in an empty row, our helper methods don't detect us as being in the "row" _or_ in the "sheetData". So `has_cell()` would return false when it shouldn't. Similarly, `read_cell()` wouldn't skip rows; so `read_cell()` would return an invalid cell when placed in an empty row, causing a segfault when the caller tried to use the cell. Callers must take care to call `has_next()` before `read_next()`. In the future, perhaps we can make `read_next()` return a `std::optional` and nix `has_next()` altogether? [Closes #492] --- source/detail/serialization/xlsx_consumer.cpp | 337 +++++++++--------- source/detail/serialization/xlsx_consumer.hpp | 2 - tests/data/Issue492_empty_row.xlsx | Bin 0 -> 4573 bytes tests/workbook/serialization_test_suite.cpp | 18 + 4 files changed, 188 insertions(+), 169 deletions(-) create mode 100644 tests/data/Issue492_empty_row.xlsx diff --git a/source/detail/serialization/xlsx_consumer.cpp b/source/detail/serialization/xlsx_consumer.cpp index 62d9c418..071c92c3 100644 --- a/source/detail/serialization/xlsx_consumer.cpp +++ b/source/detail/serialization/xlsx_consumer.cpp @@ -21,6 +21,7 @@ // @license: http://www.opensource.org/licenses/mit-license.php // @author: see AUTHORS file +#include #include #include // for std::accumulate #include @@ -406,171 +407,7 @@ void xlsx_consumer::open(std::istream &source) cell xlsx_consumer::read_cell() { - if (!has_cell()) - { - return cell(nullptr); - } - - auto ws = worksheet(current_worksheet_); - - if (in_element(qn("spreadsheetml", "sheetData"))) - { - expect_start_element(qn("spreadsheetml", "row"), xml::content::complex); // CT_Row - auto row_index = static_cast(std::stoul(parser().attribute("r"))); - auto &row_properties = ws.row_properties(row_index); - - if (parser().attribute_present("ht")) - { - row_properties.height = converter_.deserialise(parser().attribute("ht")); - } - - if (parser().attribute_present("customHeight")) - { - row_properties.custom_height = is_true(parser().attribute("customHeight")); - } - - if (parser().attribute_present("hidden") && is_true(parser().attribute("hidden"))) - { - row_properties.hidden = true; - } - - if (parser().attribute_present(qn("x14ac", "dyDescent"))) - { - row_properties.dy_descent = converter_.deserialise(parser().attribute(qn("x14ac", "dyDescent"))); - } - - if (parser().attribute_present("spans")) - { - row_properties.spans = parser().attribute("spans"); - } - - skip_attributes({"customFormat", "s", "customFont", - "outlineLevel", "collapsed", "thickTop", "thickBot", - "ph"}); - } - - if (!in_element(qn("spreadsheetml", "row"))) - { - return cell(nullptr); - } - - expect_start_element(qn("spreadsheetml", "c"), xml::content::complex); - - auto cell = streaming_ - ? xlnt::cell(streaming_cell_.get()) - : ws.cell(cell_reference(parser().attribute("r"))); - auto reference = cell_reference(parser().attribute("r")); - cell.d_->parent_ = current_worksheet_; - cell.d_->column_ = reference.column_index(); - cell.d_->row_ = reference.row(); - - if (parser().attribute_present("ph")) - { - cell.d_->phonetics_visible_ = parser().attribute("ph"); - } - - auto has_type = parser().attribute_present("t"); - auto type = has_type ? parser().attribute("t") : "n"; - - if (parser().attribute_present("s")) - { - cell.format(target_.format(static_cast(std::stoull(parser().attribute("s"))))); - } - - auto has_value = false; - auto value_string = std::string(); - - auto has_formula = false; - auto has_shared_formula = false; - auto formula_value_string = std::string(); - - while (in_element(qn("spreadsheetml", "c"))) - { - auto current_element = expect_start_element(xml::content::mixed); - - if (current_element == qn("spreadsheetml", "v")) // s:ST_Xstring - { - has_value = true; - value_string = read_text(); - } - else if (current_element == qn("spreadsheetml", "f")) // CT_CellFormula - { - has_formula = true; - - if (parser().attribute_present("t")) - { - has_shared_formula = parser().attribute("t") == "shared"; - } - - skip_attributes({"aca", "ref", "dt2D", "dtr", "del1", - "del2", "r1", "r2", "ca", "si", "bx"}); - - formula_value_string = read_text(); - } - else if (current_element == qn("spreadsheetml", "is")) // CT_Rst - { - expect_start_element(qn("spreadsheetml", "t"), xml::content::simple); - has_value = true; - value_string = read_text(); - expect_end_element(qn("spreadsheetml", "t")); - } - else - { - unexpected_element(current_element); - } - - expect_end_element(current_element); - } - - expect_end_element(qn("spreadsheetml", "c")); - - if (has_formula && !has_shared_formula) - { - cell.formula(formula_value_string); - } - - if (has_value) - { - if (type == "str") - { - cell.d_->value_text_ = value_string; - cell.data_type(cell::type::formula_string); - } - else if (type == "inlineStr") - { - cell.d_->value_text_ = value_string; - cell.data_type(cell::type::inline_string); - } - else if (type == "s") - { - cell.d_->value_numeric_ = converter_.deserialise(value_string); - cell.data_type(cell::type::shared_string); - } - else if (type == "b") // boolean - { - cell.value(is_true(value_string)); - } - else if (type == "n") // numeric - { - cell.value(converter_.deserialise(value_string)); - } - else if (!value_string.empty() && value_string[0] == '#') - { - cell.error(value_string); - } - } - - if (!in_element(qn("spreadsheetml", "row"))) - { - expect_end_element(qn("spreadsheetml", "row")); - - if (!in_element(qn("spreadsheetml", "sheetData"))) - { - expect_end_element(qn("spreadsheetml", "sheetData")); - } - } - - return cell; + return cell(streaming_cell_.get()); } void xlsx_consumer::read_worksheet(const std::string &rel_id) @@ -1411,8 +1248,174 @@ xml::parser &xlsx_consumer::parser() bool xlsx_consumer::has_cell() { - return in_element(qn("spreadsheetml", "row")) - || in_element(qn("spreadsheetml", "sheetData")); + auto ws = worksheet(current_worksheet_); + + while (streaming_cell_ // we're not at the end of the file + && !in_element(qn("spreadsheetml", "row"))) // we're at the end of a row, or between rows + { + if (parser().peek() == xml::parser::event_type::end_element + && stack_.back() == qn("spreadsheetml", "row")) + { + // We're at the end of a row. + expect_end_element(qn("spreadsheetml", "row")); + // ... and keep parsing. + } + + if (parser().peek() == xml::parser::event_type::end_element + && stack_.back() == qn("spreadsheetml", "sheetData")) + { + // End of sheet. Mark it by setting streaming_cell_ to nullptr, so we never get here again. + expect_end_element(qn("spreadsheetml", "sheetData")); + streaming_cell_.reset(nullptr); + break; + } + + expect_start_element(qn("spreadsheetml", "row"), xml::content::complex); // CT_Row + auto row_index = static_cast(std::stoul(parser().attribute("r"))); + auto &row_properties = ws.row_properties(row_index); + + if (parser().attribute_present("ht")) + { + row_properties.height = converter_.deserialise(parser().attribute("ht")); + } + + if (parser().attribute_present("customHeight")) + { + row_properties.custom_height = is_true(parser().attribute("customHeight")); + } + + if (parser().attribute_present("hidden") && is_true(parser().attribute("hidden"))) + { + row_properties.hidden = true; + } + + if (parser().attribute_present(qn("x14ac", "dyDescent"))) + { + row_properties.dy_descent = converter_.deserialise(parser().attribute(qn("x14ac", "dyDescent"))); + } + + if (parser().attribute_present("spans")) + { + row_properties.spans = parser().attribute("spans"); + } + + skip_attributes({"customFormat", "s", "customFont", + "outlineLevel", "collapsed", "thickTop", "thickBot", + "ph"}); + } + + if (!streaming_cell_) + { + // We're at the end of the worksheet + return false; + } + + expect_start_element(qn("spreadsheetml", "c"), xml::content::complex); + + assert(streaming_); + auto cell = xlnt::cell(streaming_cell_.get()); + auto reference = cell_reference(parser().attribute("r")); + cell.d_->parent_ = current_worksheet_; + cell.d_->column_ = reference.column_index(); + cell.d_->row_ = reference.row(); + + if (parser().attribute_present("ph")) + { + cell.d_->phonetics_visible_ = parser().attribute("ph"); + } + + auto has_type = parser().attribute_present("t"); + auto type = has_type ? parser().attribute("t") : "n"; + + if (parser().attribute_present("s")) + { + cell.format(target_.format(static_cast(std::stoull(parser().attribute("s"))))); + } + + auto has_value = false; + auto value_string = std::string(); + + auto has_formula = false; + auto has_shared_formula = false; + auto formula_value_string = std::string(); + + while (in_element(qn("spreadsheetml", "c"))) + { + auto current_element = expect_start_element(xml::content::mixed); + + if (current_element == qn("spreadsheetml", "v")) // s:ST_Xstring + { + has_value = true; + value_string = read_text(); + } + else if (current_element == qn("spreadsheetml", "f")) // CT_CellFormula + { + has_formula = true; + + if (parser().attribute_present("t")) + { + has_shared_formula = parser().attribute("t") == "shared"; + } + + skip_attributes({"aca", "ref", "dt2D", "dtr", "del1", + "del2", "r1", "r2", "ca", "si", "bx"}); + + formula_value_string = read_text(); + } + else if (current_element == qn("spreadsheetml", "is")) // CT_Rst + { + expect_start_element(qn("spreadsheetml", "t"), xml::content::simple); + has_value = true; + value_string = read_text(); + expect_end_element(qn("spreadsheetml", "t")); + } + else + { + unexpected_element(current_element); + } + + expect_end_element(current_element); + } + + expect_end_element(qn("spreadsheetml", "c")); + + if (has_formula && !has_shared_formula) + { + cell.formula(formula_value_string); + } + + if (has_value) + { + if (type == "str") + { + cell.d_->value_text_ = value_string; + cell.data_type(cell::type::formula_string); + } + else if (type == "inlineStr") + { + cell.d_->value_text_ = value_string; + cell.data_type(cell::type::inline_string); + } + else if (type == "s") + { + cell.d_->value_numeric_ = converter_.deserialise(value_string); + cell.data_type(cell::type::shared_string); + } + else if (type == "b") // boolean + { + cell.value(is_true(value_string)); + } + else if (type == "n") // numeric + { + cell.value(converter_.deserialise(value_string)); + } + else if (!value_string.empty() && value_string[0] == '#') + { + cell.error(value_string); + } + } + + return true; } std::vector xlsx_consumer::read_relationships(const path &part) diff --git a/source/detail/serialization/xlsx_consumer.hpp b/source/detail/serialization/xlsx_consumer.hpp index c9a987ee..2dbafa01 100644 --- a/source/detail/serialization/xlsx_consumer.hpp +++ b/source/detail/serialization/xlsx_consumer.hpp @@ -413,8 +413,6 @@ private: std::unique_ptr streaming_cell_; - detail::cell_impl *current_cell_; - detail::worksheet_impl *current_worksheet_; number_serialiser converter_; }; diff --git a/tests/data/Issue492_empty_row.xlsx b/tests/data/Issue492_empty_row.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..32b6da5aaee2633288a1b6f8e3bf43fc6d0e4874 GIT binary patch literal 4573 zcmaJ^1z42p5?*p)NnxcqA|VSZp|CE^(y(-QcN`Fw66vlb1zf-a;Ydn|(y1Z}(jYA% zp)`^LcNe_}F6Vk?pTC}Ozjr4fQwDsBz&reWUr0iVQW)xeL45nNfM zX07`Z?^Ur|YPvC>dj+h?FFV>HM(o6tnw0V@vs1UMek4!!eP($dv)&=i4yh{3^;9|v z-h?>uvEG<(p3Ze|jtVQaXsJ)s;>QUX{W^Yo(HS>$qu;h0E~l`Le=kEYT9o&8xh>G1 zl_SB)CHCsGwT2!|_N>inoj}Pw!j|q&4wl9Kb#R=y1D0hsiJ7WYAN3SOvh=oz87etRHwCwk2t#T+>w2N?(9fno6iFJAlf(yMDq!LHnJ^-Np+dzbq24fOk)WRx%NZv{PfKszaz(U%v7!J1>Tb#5$wGmL3Q2C>d2wB|^sH-)IE$!M88qUu_5KGFixzA^79vE)9CX8xkaiLf>6phw09Dwvq? zcm@zmCA}CZYF2NVyf;C8N%eY`v9w-24i+0dh$-6)#&bE`f5V0n^ABrRD|L5QHxFJ5 zH#h7FRY5g_TKFhsRs&?)>&KViGQFuGL=V2fC0KGtq210>f!QOiv@ z)$;yor`J%-bkEGJ|3svPX0ewe`|83EbEJyz8wmWmipWAq9>u8V9XUOfWbIzq7|g0C z97inr`m)nVq&Wh>>!;mf$3MvL%dZ+uDL)i^aYF}5BxJ?;h%ZVPAIHzgk(OC(SIObn zj2aorNlm`+Q_LI*%(e=6)>WZJ(O@>WArSz~#_x zDu)zGP3GmNNC<8fyKu*vMOmKNJFDBiFHHKb*!>W-@$L8os}mN+ zpUIPSl01^DhcWHmXs?uC$y2_Lqp<`X7-26b8$EgugkMBOL@r`bH8Y={`zh;-Vm<3R zU1)Gkk)(xN8Dp#Yl<%>R2Qae=g67OBn!Wk$Z5uZQ0dz{Xw|?-OXjj_%48B>_OAb}Z zS`5hY6?0&JVL{CR@?B7sx?`};XcxexmZASBYfWNzcqgYda>3M=bfcA~bArlC*Oq=Q2OFgfAp@~N)TrM(A!bLxTn0(aI|UDgS) z*sOKwNAzPn=lj=eq5FxAwY!D4!=H8@yuV)fvG8!jAwZ#gl#)AQvKju_!?Ik>3vzct zwgJt`@#f+m!j{$<7dF=aaLwp?RmfZBEM%}CLKf4Te3+(7vO2W6ALLL+ns4XP{-Iz_ zQCXVg@t`H84KX+yS*`B4og!2-lBl=!4F!!6nW${Y0biNtpd8q91-!k4HQlT4Im#1b zSP16&eOSRiP4}?3aJRA6Lb_w^$C~Y#(&L)RNBL%QM=#6A@aYxFDjfp??Kp130?%Uf z4Tl81S5#_mY{hF0j}C;3^Ax;V7$^4*{JkT&N{-^GOxvG#7btJ|rU9O*?nsf@^op0* zzMn1FSLZWYsMM-b7M~-AaU2Odl859&cv6aA^x2@@Up~FyTf(W?uu;?*HQ0#0Jok=r zyN;tgN14{ZD+vYB*1UMFphdUy@QyBmxY$>UAYS*I+$8f|=Pdr*YtHRN(-o{QbdpxY zRvSdv2p_Ar2pWc&L72nkC$gWf@U|-TRZ-jzgnH7X1uXfqmv1{)ht?d=3~L64972k6 zi5efu>od@Yg?KgGm*)IN-d)NJ6b7mh?m%640w+Aw#WN_HeOO;@e~rV-3#|0Atn2~n zB8~Tj3E5pnPPR10XYU@AF7`Td!AZ?@U zqaaLDzQ+3%LCli~!0-1%eY!xbZwqrIzc2v+WWOSa^mVp53Ei@R2_|%4|J72-2~$0B zDVF<01G-#&rrW)$BZVYa${d}ovv?LuAI2-*R%H2_CI|08IGc5eiN7T`PLa9wHA`PD zDm|lz-mR3`{4)AExbHRTkGh9}=%CD9Ztm9+k7_xjT9Sz?vx1*ezP20XN#W?Q@a+W2 z%-xGyX95Y$lGI*bB@=w;LR$XFSbZeHKS(W$w^m zl=g&vcM9Og%i|M&FicE38a8Cot{7Vm6dvbT4vJ8XeL?YcrPs@{p2n5erxGznwr%v` zaABRtlcJ{H1vupgEv0Cv2)!~f*nR_bLoSo|$vhr&bk4@7J0UVu)pus7<>+4Wsq`=? z9w`c5{g5O#E){YKrrS*VCBqpxtcBwl>3mx#FHR44>)KM^>|P5$dlNKorzStJo3dGN zK_#^-pu_5RU1$->+&YApi55ljb`3VMs__V20JfAeReDT;WI=%-_|r%ppze6%B-ox1 zl4BUt!Yq@YC=oE@*4F`RKm@!C#QwoZsM$OVET>IhSw_bJmZ zHV-f60X4gSQsg)014W}H?F8;KSnJ3;RhJ>ML~*J0yl1zFS!Y$718h1U8#wCQy-CU! zHtu!NBQC8)%Ta(E-{n=48GbzGG{1W_ArsL#T?YaQ>pD52h9zj5XRbu^UI%-Ru`w>Yi)G@%RJ0@=*Vk;Uru89}o4ndj>^dWp3Z&MBao%&b#+jH^0&l+%wXV(A~qE_W15@ z5WARiaO{MJYd+z@gIk#kE8soWV9A0DXV8(4VeSm%b(e*SI<4N=Cz2bt21_UaAAB|! zRRFTJ!ub?{4WC(Rz)mdbPJz#6G&#a{JZaz=jv?X66?+taZbCC&*-bccD^`W`R_`RX_d8;@( z!6`UtCD-@xK}(|zNbIl{9aP0S!6E8yZ|_Zu>(@0lvQ2G2EMOQg+r@JnAhxC7=;dj zS;8N+2)3-J<~&QG8S2Z?%)}5LW0xTB@1TDCm52i!wyyY0Vr zb+%J~j|2k>!@d>pU>IaV45tW72Dt={QRjG~Y)>*;}n-#bYrA_wn z^Q(Y>$Wil+cOx4YSG*qU#W`%NN_KKTT1XwLxQVOUYVjhxgd6N)O}ZjmW|^5Bsepk;<` zAc&xUaUD?>*85PFR-vix)+9^YVr!T4Q_6U6vC0ENI-zCY43Ah{gXpAKlj2I-K;%o< z1&%adTE7fULsXQ8nzhc@LAY^AoVfnI4NTti1ox}6i?F$({VF?P!@^9hfp>Oq3!uA9 z8;#r@oT%(|xzrdrD`XNa`Kq@C#YqIV*M@PGG%P=Ib8&?i^3B8C=gRI6GiqN32tc#$bMC;5xtYES1O3AE)vZ)9TJoBj`o_8-~FCO94wzWw1KYPYr+b1rUJcWElsW5D1xZu+1RO`R929&sG;~ zg*_ElOw2KGovpX$H=oT;Yymozc+$T&|64ga>+C<1`mD1t7(), std::string("a")); } + + void test_Issue492_stream_empty_row() + { + xlnt::streaming_workbook_reader wbr; + wbr.open(path_helper::test_file("Issue492_empty_row.xlsx")); + wbr.begin_worksheet("BLS Data Series"); + xlnt_assert(wbr.has_cell()); + xlnt_assert_equals(wbr.read_cell().reference(), "A1"); + xlnt_assert(wbr.has_cell()); + xlnt_assert_equals(wbr.read_cell().reference(), "A2"); + xlnt_assert(wbr.has_cell()); + xlnt_assert_equals(wbr.read_cell().reference(), "A4"); + xlnt_assert(wbr.has_cell()); + xlnt_assert_equals(wbr.read_cell().reference(), "B4"); + xlnt_assert(!wbr.has_cell()); + } }; static serialization_test_suite x;