diff --git a/include/xlnt/workbook/streaming_workbook_reader.hpp b/include/xlnt/workbook/streaming_workbook_reader.hpp index 6e975426..94bccb45 100644 --- a/include/xlnt/workbook/streaming_workbook_reader.hpp +++ b/include/xlnt/workbook/streaming_workbook_reader.hpp @@ -71,13 +71,13 @@ public: /// cell read_cell(); - bool has_worksheet(); + bool has_worksheet(const std::string &name); /// /// Beings reading of the next worksheet in the workbook and optionally /// returns its title if the last worksheet has not yet been read. /// - void begin_worksheet(); + void begin_worksheet(const std::string &name); /// /// Ends reading of the current worksheet in the workbook and optionally @@ -118,8 +118,13 @@ public: /// void open(std::istream &stream); + /// + /// Returns a vector of the titles of sheets in the workbook in order. + /// + std::vector sheet_titles(); + private: - std::vector worksheet_queue_; + std::string worksheet_rel_id_; std::unique_ptr consumer_; std::unique_ptr workbook_; std::unique_ptr stream_; diff --git a/include/xlnt/workbook/workbook.hpp b/include/xlnt/workbook/workbook.hpp index f879be08..fd2dcc57 100644 --- a/include/xlnt/workbook/workbook.hpp +++ b/include/xlnt/workbook/workbook.hpp @@ -63,6 +63,7 @@ class protection; class range; class range_reference; class relationship; +class streaming_workbook_reader; class style; class style_serializer; class theme; @@ -777,6 +778,7 @@ public: bool operator!=(const workbook &rhs) const; private: + friend class streaming_workbook_reader; friend class worksheet; friend class detail::xlsx_consumer; friend class detail::xlsx_producer; diff --git a/source/detail/serialization/xlsx_consumer.cpp b/source/detail/serialization/xlsx_consumer.cpp index 500303e3..cef9cddf 100644 --- a/source/detail/serialization/xlsx_consumer.cpp +++ b/source/detail/serialization/xlsx_consumer.cpp @@ -328,16 +328,6 @@ std::string xlsx_consumer::read_worksheet_begin(const std::string &rel_id) return p.second == rel_id; })->first; - 
auto id = sheet_title_id_map_[title]; - auto index = sheet_title_index_map_[title]; - - auto insertion_iter = target_.d_->worksheets_.begin(); - while (insertion_iter != target_.d_->worksheets_.end() && sheet_title_index_map_[insertion_iter->title_] < index) - { - ++insertion_iter; - } - - current_worksheet_ = &*target_.d_->worksheets_.emplace(insertion_iter, &target_, id, title); auto ws = worksheet(current_worksheet_); expect_start_element(qn("spreadsheetml", "worksheet"), xml::content::complex); // CT_Worksheet @@ -1572,14 +1562,29 @@ void xlsx_consumer::read_office_document(const std::string &content_type) // CT_ relationship_type::theme)}); } - if (streaming_) - { - return; - } - for (auto worksheet_rel : manifest().relationships(workbook_path, relationship_type::worksheet)) { - read_part({workbook_rel, worksheet_rel}); + auto title = std::find_if(target_.d_->sheet_title_rel_id_map_.begin(), + target_.d_->sheet_title_rel_id_map_.end(), + [&](const std::pair &p) { + return p.second == worksheet_rel.id(); + })->first; + + auto id = sheet_title_id_map_[title]; + auto index = sheet_title_index_map_[title]; + + auto insertion_iter = target_.d_->worksheets_.begin(); + while (insertion_iter != target_.d_->worksheets_.end() && sheet_title_index_map_[insertion_iter->title_] < index) + { + ++insertion_iter; + } + + current_worksheet_ = &*target_.d_->worksheets_.emplace(insertion_iter, &target_, id, title); + + if (!streaming_) + { + read_part({ workbook_rel, worksheet_rel }); + } } } diff --git a/source/workbook/streaming_workbook_reader.cpp b/source/workbook/streaming_workbook_reader.cpp index 83aa90a9..3d7965e5 100644 --- a/source/workbook/streaming_workbook_reader.cpp +++ b/source/workbook/streaming_workbook_reader.cpp @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -64,18 +65,24 @@ cell streaming_workbook_reader::read_cell() return consumer_->read_cell(); } -bool streaming_workbook_reader::has_worksheet() +bool 
streaming_workbook_reader::has_worksheet(const std::string &name) { - return !worksheet_queue_.empty(); + auto titles = sheet_titles(); + return std::find(titles.begin(), titles.end(), name) != titles.end(); } -void streaming_workbook_reader::begin_worksheet() +void streaming_workbook_reader::begin_worksheet(const std::string &title) { - const auto next_worksheet_rel = worksheet_queue_.back(); + if (!has_worksheet(title)) + { + throw xlnt::exception("sheet not found"); + } + + worksheet_rel_id_ = workbook_->impl().sheet_title_rel_id_map_.at(title); const auto workbook_rel = workbook_->manifest() .relationship(path("/"), relationship_type::office_document); const auto worksheet_rel = workbook_->manifest() - .relationship(workbook_rel.target().path(), next_worksheet_rel); + .relationship(workbook_rel.target().path(), worksheet_rel_id_); auto rel_chain = std::vector{ workbook_rel, worksheet_rel }; @@ -87,14 +94,27 @@ void streaming_workbook_reader::begin_worksheet() parser_.reset(new xml::parser(*part_stream_, part_path.string())); consumer_->parser_ = parser_.get(); - consumer_->read_worksheet_begin(next_worksheet_rel); + consumer_->current_worksheet_ = nullptr; + + for (auto &impl : workbook_->impl().worksheets_) + { + if (impl.title_ == title) + { + consumer_->current_worksheet_ = &impl; + } + } + + if (consumer_->current_worksheet_ == nullptr) + { + throw xlnt::exception("sheet not found"); + } + + consumer_->read_worksheet_begin(worksheet_rel_id_); } worksheet streaming_workbook_reader::end_worksheet() { - auto next_worksheet_rel = worksheet_queue_.back(); - worksheet_queue_.pop_back(); - return consumer_->read_worksheet_end(next_worksheet_rel); + return consumer_->read_worksheet_end(worksheet_rel_id_); } void streaming_workbook_reader::open(const std::vector &data) @@ -136,12 +156,11 @@ void streaming_workbook_reader::open(std::istream &stream) const auto workbook_rel = workbook_->manifest() .relationship(path("/"), relationship_type::office_document); const 
auto workbook_path = workbook_rel.target().path(); +} - for (auto worksheet_rel : workbook_->manifest() - .relationships(workbook_path, relationship_type::worksheet)) - { - worksheet_queue_.push_back(worksheet_rel.id()); - } +std::vector streaming_workbook_reader::sheet_titles() +{ + return workbook_->sheet_titles(); } } // namespace xlnt diff --git a/tests/workbook/serialization_test_suite.hpp b/tests/workbook/serialization_test_suite.hpp index edb3748e..abd6a642 100644 --- a/tests/workbook/serialization_test_suite.hpp +++ b/tests/workbook/serialization_test_suite.hpp @@ -473,14 +473,14 @@ public: reader.open(xlnt::path(path)); - while (reader.has_worksheet()) + for (auto sheet_name : reader.sheet_titles()) { - reader.begin_worksheet(); + reader.begin_worksheet(sheet_name); while (reader.has_cell()) { const auto cell = reader.read_cell(); - //std::cout << cell.reference().to_string() << std::endl; + std::cout << cell.reference().to_string() << " " << cell.to_string() << std::endl; } const auto ws = reader.end_worksheet(); diff --git a/xlntpyarrow/methods.cpp b/xlntpyarrow/methods.cpp index 428479b2..4b8fffe2 100644 --- a/xlntpyarrow/methods.cpp +++ b/xlntpyarrow/methods.cpp @@ -34,6 +34,7 @@ #include // must be included after Arrow #include +#include #include #include #include @@ -61,7 +62,7 @@ std::unique_ptr make_array_builder(xlnt::cell::type type) return std::unique_ptr(new arrow::Date32Builder(arrow::default_memory_pool())); } - default_case(std::unique_ptr(nullptr)); + default_case(std::unique_ptr(nullptr)); } arrow::Field make_type_field(const std::string &name, xlnt::cell::type type) @@ -82,7 +83,7 @@ arrow::Field make_type_field(const std::string &name, xlnt::cell::type type) return arrow::Field(name, arrow::date32()); } - default_case(arrow::Field("", arrow::null())); + default_case(arrow::Field("", arrow::null())); } } // namespace xlnt @@ -114,29 +115,88 @@ extern "C" { PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject 
*kwargs) { - static const char *keywords[] = { "file", NULL }; + static const char *keywords[] = { "io", "sheetname", "header", "skiprows", + "skip_footer", "index_col", "names", "converters", "dtype", "true_values", + "false_values", "parse_cols", "squeeze", "na_values", "thousands", + "keep_default_na", "verbose", "convert_float", nullptr }; static auto keywords_nc = const_cast(keywords); - PyObject *file = NULL; + PyObject *io = nullptr; + PyObject *sheetname = nullptr; + PyObject *header = nullptr; + PyObject *skiprows = nullptr; + auto skip_footer = 0; + PyObject *index_col = nullptr; + PyObject *names = nullptr; + PyObject *converters = nullptr; + PyObject *dtype = nullptr; + PyObject *true_values = nullptr; + PyObject *false_values = nullptr; + PyObject *parse_cols = nullptr; + int squeeze = 0; + PyObject *na_values = nullptr; + const char *thousands = nullptr; + int keep_default_na = 0; + int verbose = 0; + int convert_float = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", keywords_nc, &file)) + std::cout << "here" << std::endl; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOiOOOOOOOpOzppp", keywords_nc, + &io, &sheetname, &header, &skiprows, &skip_footer, &index_col, &names, + &converters, &dtype, &true_values, &false_values, &parse_cols, &squeeze, + &na_values, &thousands, &keep_default_na, &verbose, &convert_float)) { - return NULL; + PyErr_Print(); + PyErr_Clear(); + Py_RETURN_NONE; } + std::cout << "here2" << std::endl; + if (!import_pyarrow()) { Py_RETURN_NONE; } + std::cout << "here3" << std::endl; - xlnt::python_streambuf file_buffer(file); + // arg #1, io + xlnt::python_streambuf file_buffer(io); std::istream file_stream(&file_buffer); xlnt::streaming_workbook_reader reader; reader.open(file_stream); - reader.begin_worksheet(); + std::cout << "here4" << std::endl; + + // arg #2, sheetname + auto sheet_titles = reader.sheet_titles(); + auto sheet_title = sheet_titles.front(); + + std::cout << "here5 " << 
sheet_title << std::endl; + + if (sheetname != nullptr) + { + std::cout << "sheetname" << std::endl; + + if (PyLong_Check(sheetname)) + { + std::cout << "is long" << std::endl; + // handle int sheetname + auto sheet_index = PyLong_AsLong(sheetname); + sheet_title = sheet_titles.at(sheet_index); + } + else if (PyUnicode_Check(sheetname)) + { + std::cout << "is string" << std::endl; + // handle string sheetname + sheet_title = std::string(reinterpret_cast(PyUnicode_1BYTE_DATA(sheetname))); + } + } + + std::cout << sheet_title << std::endl; + reader.begin_worksheet(sheet_title); auto column_names = std::vector(); auto columns = std::vector>(); @@ -223,15 +283,15 @@ PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwarg PyObject *xlntpyarrow_arrow2xlsx(PyObject *self, PyObject *args, PyObject *kwargs) { - static const char *keywords[] = { "table", "file", NULL }; + static const char *keywords[] = { "table", "file", nullptr }; static auto keywords_nc = const_cast(keywords); - PyObject *table = NULL; - PyObject *file = NULL; + PyObject *table = nullptr; + PyObject *file = nullptr; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO", keywords_nc, &table, &file)) { - return NULL; + return nullptr; } if (!import_pyarrow())