mirror of
https://github.com/tfussell/xlnt.git
synced 2024-03-22 13:11:17 +08:00
start implementing xlsx2arrow params
This commit is contained in:
parent
8965cfa82c
commit
eaaa310cef
|
@ -71,13 +71,13 @@ public:
|
|||
/// </summary>
|
||||
cell read_cell();
|
||||
|
||||
bool has_worksheet();
|
||||
bool has_worksheet(const std::string &name);
|
||||
|
||||
/// <summary>
|
||||
/// Begins reading of the next worksheet in the workbook and optionally
|
||||
/// returns its title if the last worksheet has not yet been read.
|
||||
/// </summary>
|
||||
void begin_worksheet();
|
||||
void begin_worksheet(const std::string &name);
|
||||
|
||||
/// <summary>
|
||||
/// Ends reading of the current worksheet in the workbook and optionally
|
||||
|
@ -118,8 +118,13 @@ public:
|
|||
/// </summary>
|
||||
void open(std::istream &stream);
|
||||
|
||||
/// <summary>
|
||||
/// Returns a vector of the titles of sheets in the workbook in order.
|
||||
/// </summary>
|
||||
std::vector<std::string> sheet_titles();
|
||||
|
||||
private:
|
||||
std::vector<std::string> worksheet_queue_;
|
||||
std::string worksheet_rel_id_;
|
||||
std::unique_ptr<detail::xlsx_consumer> consumer_;
|
||||
std::unique_ptr<workbook> workbook_;
|
||||
std::unique_ptr<std::istream> stream_;
|
||||
|
|
|
@ -63,6 +63,7 @@ class protection;
|
|||
class range;
|
||||
class range_reference;
|
||||
class relationship;
|
||||
class streaming_workbook_reader;
|
||||
class style;
|
||||
class style_serializer;
|
||||
class theme;
|
||||
|
@ -777,6 +778,7 @@ public:
|
|||
bool operator!=(const workbook &rhs) const;
|
||||
|
||||
private:
|
||||
friend class streaming_workbook_reader;
|
||||
friend class worksheet;
|
||||
friend class detail::xlsx_consumer;
|
||||
friend class detail::xlsx_producer;
|
||||
|
|
|
@ -328,16 +328,6 @@ std::string xlsx_consumer::read_worksheet_begin(const std::string &rel_id)
|
|||
return p.second == rel_id;
|
||||
})->first;
|
||||
|
||||
auto id = sheet_title_id_map_[title];
|
||||
auto index = sheet_title_index_map_[title];
|
||||
|
||||
auto insertion_iter = target_.d_->worksheets_.begin();
|
||||
while (insertion_iter != target_.d_->worksheets_.end() && sheet_title_index_map_[insertion_iter->title_] < index)
|
||||
{
|
||||
++insertion_iter;
|
||||
}
|
||||
|
||||
current_worksheet_ = &*target_.d_->worksheets_.emplace(insertion_iter, &target_, id, title);
|
||||
auto ws = worksheet(current_worksheet_);
|
||||
|
||||
expect_start_element(qn("spreadsheetml", "worksheet"), xml::content::complex); // CT_Worksheet
|
||||
|
@ -1572,14 +1562,29 @@ void xlsx_consumer::read_office_document(const std::string &content_type) // CT_
|
|||
relationship_type::theme)});
|
||||
}
|
||||
|
||||
if (streaming_)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto worksheet_rel : manifest().relationships(workbook_path, relationship_type::worksheet))
|
||||
{
|
||||
read_part({workbook_rel, worksheet_rel});
|
||||
auto title = std::find_if(target_.d_->sheet_title_rel_id_map_.begin(),
|
||||
target_.d_->sheet_title_rel_id_map_.end(),
|
||||
[&](const std::pair<std::string, std::string> &p) {
|
||||
return p.second == worksheet_rel.id();
|
||||
})->first;
|
||||
|
||||
auto id = sheet_title_id_map_[title];
|
||||
auto index = sheet_title_index_map_[title];
|
||||
|
||||
auto insertion_iter = target_.d_->worksheets_.begin();
|
||||
while (insertion_iter != target_.d_->worksheets_.end() && sheet_title_index_map_[insertion_iter->title_] < index)
|
||||
{
|
||||
++insertion_iter;
|
||||
}
|
||||
|
||||
current_worksheet_ = &*target_.d_->worksheets_.emplace(insertion_iter, &target_, id, title);
|
||||
|
||||
if (!streaming_)
|
||||
{
|
||||
read_part({ workbook_rel, worksheet_rel });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
#include <fstream>
|
||||
|
||||
#include <detail/implementations/workbook_impl.hpp>
|
||||
#include <detail/serialization/open_stream.hpp>
|
||||
#include <detail/serialization/vector_streambuf.hpp>
|
||||
#include <detail/serialization/xlsx_consumer.hpp>
|
||||
|
@ -64,18 +65,24 @@ cell streaming_workbook_reader::read_cell()
|
|||
return consumer_->read_cell();
|
||||
}
|
||||
|
||||
bool streaming_workbook_reader::has_worksheet()
|
||||
bool streaming_workbook_reader::has_worksheet(const std::string &name)
|
||||
{
|
||||
return !worksheet_queue_.empty();
|
||||
auto titles = sheet_titles();
|
||||
return std::find(titles.begin(), titles.end(), name) != titles.end();
|
||||
}
|
||||
|
||||
void streaming_workbook_reader::begin_worksheet()
|
||||
void streaming_workbook_reader::begin_worksheet(const std::string &title)
|
||||
{
|
||||
const auto next_worksheet_rel = worksheet_queue_.back();
|
||||
if (!has_worksheet(title))
|
||||
{
|
||||
throw xlnt::exception("sheet not found");
|
||||
}
|
||||
|
||||
worksheet_rel_id_ = workbook_->impl().sheet_title_rel_id_map_.at(title);
|
||||
const auto workbook_rel = workbook_->manifest()
|
||||
.relationship(path("/"), relationship_type::office_document);
|
||||
const auto worksheet_rel = workbook_->manifest()
|
||||
.relationship(workbook_rel.target().path(), next_worksheet_rel);
|
||||
.relationship(workbook_rel.target().path(), worksheet_rel_id_);
|
||||
|
||||
auto rel_chain = std::vector<relationship>{ workbook_rel, worksheet_rel };
|
||||
|
||||
|
@ -87,14 +94,27 @@ void streaming_workbook_reader::begin_worksheet()
|
|||
parser_.reset(new xml::parser(*part_stream_, part_path.string()));
|
||||
consumer_->parser_ = parser_.get();
|
||||
|
||||
consumer_->read_worksheet_begin(next_worksheet_rel);
|
||||
consumer_->current_worksheet_ = nullptr;
|
||||
|
||||
for (auto &impl : workbook_->impl().worksheets_)
|
||||
{
|
||||
if (impl.title_ == title)
|
||||
{
|
||||
consumer_->current_worksheet_ = &impl;
|
||||
}
|
||||
}
|
||||
|
||||
if (consumer_->current_worksheet_ == nullptr)
|
||||
{
|
||||
throw xlnt::exception("sheet not found");
|
||||
}
|
||||
|
||||
consumer_->read_worksheet_begin(worksheet_rel_id_);
|
||||
}
|
||||
|
||||
worksheet streaming_workbook_reader::end_worksheet()
|
||||
{
|
||||
auto next_worksheet_rel = worksheet_queue_.back();
|
||||
worksheet_queue_.pop_back();
|
||||
return consumer_->read_worksheet_end(next_worksheet_rel);
|
||||
return consumer_->read_worksheet_end(worksheet_rel_id_);
|
||||
}
|
||||
|
||||
void streaming_workbook_reader::open(const std::vector<std::uint8_t> &data)
|
||||
|
@ -136,12 +156,11 @@ void streaming_workbook_reader::open(std::istream &stream)
|
|||
const auto workbook_rel = workbook_->manifest()
|
||||
.relationship(path("/"), relationship_type::office_document);
|
||||
const auto workbook_path = workbook_rel.target().path();
|
||||
}
|
||||
|
||||
for (auto worksheet_rel : workbook_->manifest()
|
||||
.relationships(workbook_path, relationship_type::worksheet))
|
||||
{
|
||||
worksheet_queue_.push_back(worksheet_rel.id());
|
||||
}
|
||||
std::vector<std::string> streaming_workbook_reader::sheet_titles()
|
||||
{
|
||||
return workbook_->sheet_titles();
|
||||
}
|
||||
|
||||
} // namespace xlnt
|
||||
|
|
|
@ -473,14 +473,14 @@ public:
|
|||
|
||||
reader.open(xlnt::path(path));
|
||||
|
||||
while (reader.has_worksheet())
|
||||
for (auto sheet_name : reader.sheet_titles())
|
||||
{
|
||||
reader.begin_worksheet();
|
||||
reader.begin_worksheet(sheet_name);
|
||||
|
||||
while (reader.has_cell())
|
||||
{
|
||||
const auto cell = reader.read_cell();
|
||||
//std::cout << cell.reference().to_string() << std::endl;
|
||||
std::cout << cell.reference().to_string() << " " << cell.to_string() << std::endl;
|
||||
}
|
||||
|
||||
const auto ws = reader.end_worksheet();
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#include <Python.h> // must be included after Arrow
|
||||
|
||||
#include <detail/default_case.hpp>
|
||||
#include <detail/unicode.hpp>
|
||||
#include <python_streambuf.hpp>
|
||||
#include <xlnt/cell/cell.hpp>
|
||||
#include <xlnt/cell/cell_reference.hpp>
|
||||
|
@ -61,7 +62,7 @@ std::unique_ptr<arrow::ArrayBuilder> make_array_builder(xlnt::cell::type type)
|
|||
return std::unique_ptr<arrow::Date32Builder>(new arrow::Date32Builder(arrow::default_memory_pool()));
|
||||
}
|
||||
|
||||
default_case(std::unique_ptr<arrow::ArrayBuilder>(nullptr));
|
||||
// Fallback handed to default_case: an empty (null) builder pointer.
// `nullptrptr` was a search-and-replace slip and is not a valid identifier.
default_case(std::unique_ptr<arrow::ArrayBuilder>(nullptr));
|
||||
}
|
||||
|
||||
arrow::Field make_type_field(const std::string &name, xlnt::cell::type type)
|
||||
|
@ -82,7 +83,7 @@ arrow::Field make_type_field(const std::string &name, xlnt::cell::type type)
|
|||
return arrow::Field(name, arrow::date32());
|
||||
}
|
||||
|
||||
default_case(arrow::Field("", arrow::null()));
|
||||
// Fallback handed to default_case: an unnamed field of Arrow's null data type.
// `arrow::nullptr()` is ill-formed (`nullptr` is a keyword); the factory is arrow::null().
default_case(arrow::Field("", arrow::null()));
|
||||
}
|
||||
|
||||
} // namespace xlnt
|
||||
|
@ -114,29 +115,88 @@ extern "C" {
|
|||
|
||||
PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwargs)
|
||||
{
|
||||
static const char *keywords[] = { "file", NULL };
|
||||
static const char *keywords[] = { "io", "sheetname", "header", "skiprows",
|
||||
"skip_footer", "index_col", "names", "converters", "dtype", "true_values",
|
||||
"false_values", "parse_cols", "squeeze", "na_values", "thousands",
|
||||
"keep_default_na", "verbose", "convert_float", nullptr };
|
||||
static auto keywords_nc = const_cast<char **>(keywords);
|
||||
|
||||
PyObject *file = NULL;
|
||||
PyObject *io = nullptr;
|
||||
PyObject *sheetname = nullptr;
|
||||
PyObject *header = nullptr;
|
||||
PyObject *skiprows = nullptr;
|
||||
auto skip_footer = 0;
|
||||
PyObject *index_col = nullptr;
|
||||
PyObject *names = nullptr;
|
||||
PyObject *converters = nullptr;
|
||||
PyObject *dtype = nullptr;
|
||||
PyObject *true_values = nullptr;
|
||||
PyObject *false_values = nullptr;
|
||||
PyObject *parse_cols = nullptr;
|
||||
auto squeeze = false;
|
||||
PyObject *na_values = nullptr;
|
||||
const char *thousands = nullptr;
|
||||
auto keep_default_va = false;
|
||||
auto verbose = false;
|
||||
auto convert_float = false;
|
||||
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", keywords_nc, &file))
|
||||
std::cout << "here" << std::endl;
|
||||
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOiOOOOOOOpOzppp", keywords_nc,
|
||||
&io, &sheetname, &header, &skiprows, &skip_footer, &index_col, &names,
|
||||
&converters, &dtype, &true_values, &false_values, &parse_cols, &squeeze,
|
||||
&na_values, &thousands, &keep_default_va, &verbose, &convert_float))
|
||||
{
|
||||
return NULL;
|
||||
PyErr_Print();
|
||||
PyErr_Clear();
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
std::cout << "here2" << std::endl;
|
||||
|
||||
if (!import_pyarrow())
|
||||
{
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
std::cout << "here3" << std::endl;
|
||||
|
||||
xlnt::python_streambuf file_buffer(file);
|
||||
// arg #1, io
|
||||
xlnt::python_streambuf file_buffer(io);
|
||||
std::istream file_stream(&file_buffer);
|
||||
|
||||
xlnt::streaming_workbook_reader reader;
|
||||
reader.open(file_stream);
|
||||
|
||||
reader.begin_worksheet();
|
||||
std::cout << "here4" << std::endl;
|
||||
|
||||
// arg #2, sheetname
|
||||
auto sheet_titles = reader.sheet_titles();
|
||||
auto sheet_title = sheet_titles.front();
|
||||
|
||||
std::cout << "here5 " << sheet_title << std::endl;
|
||||
|
||||
if (sheetname != nullptr)
|
||||
{
|
||||
std::cout << "sheetname" << std::endl;
|
||||
|
||||
if (PyLong_Check(sheetname))
|
||||
{
|
||||
std::cout << "is long" << std::endl;
|
||||
// handle int sheetname
|
||||
auto sheet_index = PyLong_AsLong(sheetname);
|
||||
sheet_title = sheet_titles.at(sheet_index);
|
||||
}
|
||||
else if (PyUnicode_Check(sheetname))
|
||||
{
|
||||
std::cout << "is string" << std::endl;
|
||||
// handle string sheetname
|
||||
sheet_title = std::string(reinterpret_cast<char *>(PyUnicode_1BYTE_DATA(sheetname)));
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << sheet_title << std::endl;
|
||||
reader.begin_worksheet(sheet_title);
|
||||
|
||||
auto column_names = std::vector<std::string>();
|
||||
auto columns = std::vector<std::unique_ptr<arrow::ArrayBuilder>>();
|
||||
|
@ -223,15 +283,15 @@ PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwarg
|
|||
|
||||
PyObject *xlntpyarrow_arrow2xlsx(PyObject *self, PyObject *args, PyObject *kwargs)
|
||||
{
|
||||
static const char *keywords[] = { "table", "file", NULL };
|
||||
static const char *keywords[] = { "table", "file", nullptr };
|
||||
static auto keywords_nc = const_cast<char **>(keywords);
|
||||
|
||||
PyObject *table = NULL;
|
||||
PyObject *file = NULL;
|
||||
PyObject *table = nullptr;
|
||||
PyObject *file = nullptr;
|
||||
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO", keywords_nc, &table, &file))
|
||||
{
|
||||
return NULL;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (!import_pyarrow())
|
||||
|
|
Loading…
Reference in New Issue
Block a user