start implementing xlsx2arrow params

This commit is contained in:
Thomas Fussell 2017-07-19 17:21:55 -07:00
parent 8965cfa82c
commit eaaa310cef
6 changed files with 139 additions and 48 deletions

View File

@ -71,13 +71,13 @@ public:
/// </summary>
cell read_cell();
bool has_worksheet();
bool has_worksheet(const std::string &name);
/// <summary>
/// Begins reading of the next worksheet in the workbook and optionally
/// returns its title if the last worksheet has not yet been read.
/// </summary>
void begin_worksheet();
void begin_worksheet(const std::string &name);
/// <summary>
/// Ends reading of the current worksheet in the workbook and optionally
@ -118,8 +118,13 @@ public:
/// </summary>
void open(std::istream &stream);
/// <summary>
/// Returns a vector of the titles of sheets in the workbook in order.
/// </summary>
std::vector<std::string> sheet_titles();
private:
std::vector<std::string> worksheet_queue_;
std::string worksheet_rel_id_;
std::unique_ptr<detail::xlsx_consumer> consumer_;
std::unique_ptr<workbook> workbook_;
std::unique_ptr<std::istream> stream_;

View File

@ -63,6 +63,7 @@ class protection;
class range;
class range_reference;
class relationship;
class streaming_workbook_reader;
class style;
class style_serializer;
class theme;
@ -777,6 +778,7 @@ public:
bool operator!=(const workbook &rhs) const;
private:
friend class streaming_workbook_reader;
friend class worksheet;
friend class detail::xlsx_consumer;
friend class detail::xlsx_producer;

View File

@ -328,16 +328,6 @@ std::string xlsx_consumer::read_worksheet_begin(const std::string &rel_id)
return p.second == rel_id;
})->first;
auto id = sheet_title_id_map_[title];
auto index = sheet_title_index_map_[title];
auto insertion_iter = target_.d_->worksheets_.begin();
while (insertion_iter != target_.d_->worksheets_.end() && sheet_title_index_map_[insertion_iter->title_] < index)
{
++insertion_iter;
}
current_worksheet_ = &*target_.d_->worksheets_.emplace(insertion_iter, &target_, id, title);
auto ws = worksheet(current_worksheet_);
expect_start_element(qn("spreadsheetml", "worksheet"), xml::content::complex); // CT_Worksheet
@ -1572,14 +1562,29 @@ void xlsx_consumer::read_office_document(const std::string &content_type) // CT_
relationship_type::theme)});
}
if (streaming_)
{
return;
}
for (auto worksheet_rel : manifest().relationships(workbook_path, relationship_type::worksheet))
{
read_part({workbook_rel, worksheet_rel});
auto title = std::find_if(target_.d_->sheet_title_rel_id_map_.begin(),
target_.d_->sheet_title_rel_id_map_.end(),
[&](const std::pair<std::string, std::string> &p) {
return p.second == worksheet_rel.id();
})->first;
auto id = sheet_title_id_map_[title];
auto index = sheet_title_index_map_[title];
auto insertion_iter = target_.d_->worksheets_.begin();
while (insertion_iter != target_.d_->worksheets_.end() && sheet_title_index_map_[insertion_iter->title_] < index)
{
++insertion_iter;
}
current_worksheet_ = &*target_.d_->worksheets_.emplace(insertion_iter, &target_, id, title);
if (!streaming_)
{
read_part({ workbook_rel, worksheet_rel });
}
}
}

View File

@ -23,6 +23,7 @@
#include <fstream>
#include <detail/implementations/workbook_impl.hpp>
#include <detail/serialization/open_stream.hpp>
#include <detail/serialization/vector_streambuf.hpp>
#include <detail/serialization/xlsx_consumer.hpp>
@ -64,18 +65,24 @@ cell streaming_workbook_reader::read_cell()
return consumer_->read_cell();
}
bool streaming_workbook_reader::has_worksheet()
bool streaming_workbook_reader::has_worksheet(const std::string &name)
{
return !worksheet_queue_.empty();
auto titles = sheet_titles();
return std::find(titles.begin(), titles.end(), name) != titles.end();
}
void streaming_workbook_reader::begin_worksheet()
void streaming_workbook_reader::begin_worksheet(const std::string &title)
{
const auto next_worksheet_rel = worksheet_queue_.back();
if (!has_worksheet(title))
{
throw xlnt::exception("sheet not found");
}
worksheet_rel_id_ = workbook_->impl().sheet_title_rel_id_map_.at(title);
const auto workbook_rel = workbook_->manifest()
.relationship(path("/"), relationship_type::office_document);
const auto worksheet_rel = workbook_->manifest()
.relationship(workbook_rel.target().path(), next_worksheet_rel);
.relationship(workbook_rel.target().path(), worksheet_rel_id_);
auto rel_chain = std::vector<relationship>{ workbook_rel, worksheet_rel };
@ -87,14 +94,27 @@ void streaming_workbook_reader::begin_worksheet()
parser_.reset(new xml::parser(*part_stream_, part_path.string()));
consumer_->parser_ = parser_.get();
consumer_->read_worksheet_begin(next_worksheet_rel);
consumer_->current_worksheet_ = nullptr;
for (auto &impl : workbook_->impl().worksheets_)
{
if (impl.title_ == title)
{
consumer_->current_worksheet_ = &impl;
}
}
if (consumer_->current_worksheet_ == nullptr)
{
throw xlnt::exception("sheet not found");
}
consumer_->read_worksheet_begin(worksheet_rel_id_);
}
worksheet streaming_workbook_reader::end_worksheet()
{
auto next_worksheet_rel = worksheet_queue_.back();
worksheet_queue_.pop_back();
return consumer_->read_worksheet_end(next_worksheet_rel);
return consumer_->read_worksheet_end(worksheet_rel_id_);
}
void streaming_workbook_reader::open(const std::vector<std::uint8_t> &data)
@ -136,12 +156,11 @@ void streaming_workbook_reader::open(std::istream &stream)
const auto workbook_rel = workbook_->manifest()
.relationship(path("/"), relationship_type::office_document);
const auto workbook_path = workbook_rel.target().path();
}
for (auto worksheet_rel : workbook_->manifest()
.relationships(workbook_path, relationship_type::worksheet))
{
worksheet_queue_.push_back(worksheet_rel.id());
}
// Returns the sheet titles reported by the workbook held in workbook_,
// by delegating to workbook::sheet_titles().
std::vector<std::string> streaming_workbook_reader::sheet_titles()
{
    auto titles = workbook_->sheet_titles();
    return titles;
}
} // namespace xlnt

View File

@ -473,14 +473,14 @@ public:
reader.open(xlnt::path(path));
while (reader.has_worksheet())
for (auto sheet_name : reader.sheet_titles())
{
reader.begin_worksheet();
reader.begin_worksheet(sheet_name);
while (reader.has_cell())
{
const auto cell = reader.read_cell();
//std::cout << cell.reference().to_string() << std::endl;
std::cout << cell.reference().to_string() << " " << cell.to_string() << std::endl;
}
const auto ws = reader.end_worksheet();

View File

@ -34,6 +34,7 @@
#include <Python.h> // must be included after Arrow
#include <detail/default_case.hpp>
#include <detail/unicode.hpp>
#include <python_streambuf.hpp>
#include <xlnt/cell/cell.hpp>
#include <xlnt/cell/cell_reference.hpp>
@ -61,7 +62,7 @@ std::unique_ptr<arrow::ArrayBuilder> make_array_builder(xlnt::cell::type type)
return std::unique_ptr<arrow::Date32Builder>(new arrow::Date32Builder(arrow::default_memory_pool()));
}
default_case(std::unique_ptr<arrow::ArrayBuilder>(nullptr));
// Fallback result: an empty (null) builder pointer. default_case is presumably the
// macro from detail/default_case.hpp that supplies the switch's default branch.
// The argument must be the keyword `nullptr`; "nullptrptr" names nothing.
default_case(std::unique_ptr<arrow::ArrayBuilder>(nullptr));
}
arrow::Field make_type_field(const std::string &name, xlnt::cell::type type)
@ -82,7 +83,7 @@ arrow::Field make_type_field(const std::string &name, xlnt::cell::type type)
return arrow::Field(name, arrow::date32());
}
default_case(arrow::Field("", arrow::null()));
// Fallback result: an unnamed field of Arrow's null type. arrow::null() is the
// Arrow data-type factory; `nullptr` is a C++ keyword and cannot be called as
// arrow::nullptr(). default_case is presumably the macro from
// detail/default_case.hpp that supplies the switch's default branch.
default_case(arrow::Field("", arrow::null()));
}
} // namespace xlnt
@ -114,29 +115,88 @@ extern "C" {
PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwargs)
{
static const char *keywords[] = { "file", NULL };
static const char *keywords[] = { "io", "sheetname", "header", "skiprows",
"skip_footer", "index_col", "names", "converters", "dtype", "true_values",
"false_values", "parse_cols", "squeeze", "na_values", "thousands",
"keep_default_na", "verbose", "convert_float", nullptr };
static auto keywords_nc = const_cast<char **>(keywords);
PyObject *file = NULL;
PyObject *io = nullptr;
PyObject *sheetname = nullptr;
PyObject *header = nullptr;
PyObject *skiprows = nullptr;
auto skip_footer = 0;
PyObject *index_col = nullptr;
PyObject *names = nullptr;
PyObject *converters = nullptr;
PyObject *dtype = nullptr;
PyObject *true_values = nullptr;
PyObject *false_values = nullptr;
PyObject *parse_cols = nullptr;
// The "p" format unit of PyArg_ParseTupleAndKeywords stores its result through an
// int *, so every flag parsed with "p" must be an int. A bool target is typically
// smaller than an int and would be written past its end.
int squeeze = 0;
PyObject *na_values = nullptr;
// Parsed with "z": stays nullptr when the argument is omitted or None.
const char *thousands = nullptr;
int keep_default_va = 0;
int verbose = 0;
int convert_float = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", keywords_nc, &file))
std::cout << "here" << std::endl;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOiOOOOOOOpOzppp", keywords_nc,
&io, &sheetname, &header, &skiprows, &skip_footer, &index_col, &names,
&converters, &dtype, &true_values, &false_values, &parse_cols, &squeeze,
&na_values, &thousands, &keep_default_va, &verbose, &convert_float))
{
return NULL;
PyErr_Print();
PyErr_Clear();
Py_RETURN_NONE;
}
std::cout << "here2" << std::endl;
if (!import_pyarrow())
{
Py_RETURN_NONE;
}
std::cout << "here3" << std::endl;
xlnt::python_streambuf file_buffer(file);
// arg #1, io
xlnt::python_streambuf file_buffer(io);
std::istream file_stream(&file_buffer);
xlnt::streaming_workbook_reader reader;
reader.open(file_stream);
reader.begin_worksheet();
std::cout << "here4" << std::endl;
// arg #2, sheetname
auto sheet_titles = reader.sheet_titles();
auto sheet_title = sheet_titles.front();
std::cout << "here5 " << sheet_title << std::endl;
// Resolve the optional sheetname argument, which may be either a zero-based index
// into sheet_titles or a sheet title string. On failure a Python exception is set
// and nullptr is returned, so that no C++ exception escapes this extern "C"
// entry point.
if (sheetname != nullptr)
{
    std::cout << "sheetname" << std::endl;

    if (PyLong_Check(sheetname))
    {
        std::cout << "is long" << std::endl;
        // handle int sheetname
        const auto sheet_index = PyLong_AsLong(sheetname);

        if (sheet_index == -1 && PyErr_Occurred())
        {
            // Conversion failed (e.g. overflow); the Python error is already set.
            return nullptr;
        }

        if (sheet_index < 0 || static_cast<unsigned long>(sheet_index) >= sheet_titles.size())
        {
            // Report a bad index to Python instead of letting vector::at throw
            // std::out_of_range through the C boundary.
            PyErr_SetString(PyExc_IndexError, "sheet index out of range");
            return nullptr;
        }

        sheet_title = sheet_titles[static_cast<unsigned long>(sheet_index)];
    }
    else if (PyUnicode_Check(sheetname))
    {
        std::cout << "is string" << std::endl;
        // handle string sheetname
        // PyUnicode_AsUTF8 works for every string kind, whereas PyUnicode_1BYTE_DATA
        // is only meaningful for 1-byte-kind strings. The returned buffer is owned
        // by the Python object, so it is copied into sheet_title.
        const char *utf8_title = PyUnicode_AsUTF8(sheetname);

        if (utf8_title == nullptr)
        {
            // Encoding failed; the Python error is already set.
            return nullptr;
        }

        sheet_title = utf8_title;
    }
}
std::cout << sheet_title << std::endl;
reader.begin_worksheet(sheet_title);
auto column_names = std::vector<std::string>();
auto columns = std::vector<std::unique_ptr<arrow::ArrayBuilder>>();
@ -223,15 +283,15 @@ PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwarg
PyObject *xlntpyarrow_arrow2xlsx(PyObject *self, PyObject *args, PyObject *kwargs)
{
static const char *keywords[] = { "table", "file", NULL };
static const char *keywords[] = { "table", "file", nullptr };
static auto keywords_nc = const_cast<char **>(keywords);
PyObject *table = NULL;
PyObject *file = NULL;
PyObject *table = nullptr;
PyObject *file = nullptr;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO", keywords_nc, &table, &file))
{
return NULL;
return nullptr;
}
if (!import_pyarrow())