From 33399a5390c93af9f47d536d5186e0fcef1b1217 Mon Sep 17 00:00:00 2001 From: Thomas Fussell Date: Sat, 15 Jul 2017 10:39:36 -0700 Subject: [PATCH] now we're getting somewhere! xlntpyarrow.xlsx2arrow returns an Arrow table that can be converted to a pandas DataFrame now --- include/xlnt/utils/xlntarrow.hpp | 2 +- source/utils/xlntarrow.cpp | 43 ++++++++++++++++++++- xlntpyarrow/setup.py.cmake | 1 + xlntpyarrow/xlntpyarrow.cpp | 65 ++++++++++++++++++++++++++++---- 4 files changed, 102 insertions(+), 9 deletions(-) diff --git a/include/xlnt/utils/xlntarrow.hpp b/include/xlnt/utils/xlntarrow.hpp index 87492595..2dbed739 100644 --- a/include/xlnt/utils/xlntarrow.hpp +++ b/include/xlnt/utils/xlntarrow.hpp @@ -10,6 +10,6 @@ class Table; namespace xlnt { std::shared_ptr XLNT_API xlsx2arrow(std::istream &s); -void XLNT_API arrow2xlsx(std::shared_ptr &table, std::ostream &s); +void XLNT_API arrow2xlsx(std::shared_ptr &table, std::ostream &s); } // namespace xlnt diff --git a/source/utils/xlntarrow.cpp b/source/utils/xlntarrow.cpp index 679fa7be..b7a49902 100644 --- a/source/utils/xlntarrow.cpp +++ b/source/utils/xlntarrow.cpp @@ -26,6 +26,7 @@ #include #pragma warning(pop) +#include #include #include #include @@ -52,6 +53,8 @@ std::unique_ptr make_array_builder(xlnt::cell::type type) case xlnt::cell::type::date: return std::unique_ptr(new arrow::Date32Builder(arrow::default_memory_pool())); } + + default_case(std::unique_ptr(nullptr)); } arrow::Field make_type_field(const std::string &name, xlnt::cell::type type) @@ -71,6 +74,8 @@ arrow::Field make_type_field(const std::string &name, xlnt::cell::type type) case xlnt::cell::type::date: return arrow::Field(name, arrow::date32()); } + + default_case(arrow::Field("", arrow::null())); } } // namespace @@ -103,6 +108,7 @@ std::shared_ptr XLNT_API xlsx2arrow(std::istream &s) if (cell.row() == 1) { column_names.push_back(cell.value()); + continue; } else if (cell.row() == 2) { @@ -111,6 +117,41 @@ std::shared_ptr XLNT_API xlsx2arrow(std::istream &s) fields.push_back(std::make_shared(field)); columns.push_back(make_array_builder(cell.data_type())); } + + auto builder = columns.at(cell.column().index - 1).get(); + + switch (cell.data_type()) + { + case xlnt::cell::type::number: + { + auto typed_builder = static_cast(builder); + typed_builder->Append(0); + break; + } + case xlnt::cell::type::inline_string: + case xlnt::cell::type::shared_string: + case xlnt::cell::type::error: + case xlnt::cell::type::formula_string: + case xlnt::cell::type::empty: + { + auto typed_builder = static_cast(builder); + typed_builder->Append(cell.value()); + break; + } + case xlnt::cell::type::boolean: + { + auto typed_builder = static_cast(builder); + typed_builder->Append(cell.value()); + break; + } + case xlnt::cell::type::date: + { + auto typed_builder = static_cast(builder); + typed_builder->Append(cell.value()); + break; + } + } + } reader.end_worksheet(); @@ -131,7 +172,7 @@ std::shared_ptr XLNT_API xlsx2arrow(std::istream &s) return table; } -void XLNT_API arrow2xlsx(std::shared_ptr &table, std::ostream &s) +void XLNT_API arrow2xlsx(std::shared_ptr &table, std::ostream &s) { xlnt::streaming_workbook_writer writer; writer.open(s); diff --git a/xlntpyarrow/setup.py.cmake b/xlntpyarrow/setup.py.cmake index a53b9542..eb74f5a3 100644 --- a/xlntpyarrow/setup.py.cmake +++ b/xlntpyarrow/setup.py.cmake @@ -42,6 +42,7 @@ xlntpyarrow_extension = Extension( include_dirs = include_dirs, libraries = [ 'arrow', + 'arrow_python', 'xlnt' ], library_dirs = library_dirs, diff --git a/xlntpyarrow/xlntpyarrow.cpp b/xlntpyarrow/xlntpyarrow.cpp index 5f025afa..1d850b09 100644 --- a/xlntpyarrow/xlntpyarrow.cpp +++ b/xlntpyarrow/xlntpyarrow.cpp @@ -3,17 +3,65 @@ #include #include +#include #include // must be included after Arrow #include #include -PyObject *xlsx2arrow(PyObject *file) +bool import_pyarrow() { - xlnt::python_streambuf buffer(file); + static bool imported = false; + + if (!imported) + { + if (!arrow::py::import_pyarrow()) + { + if (PyErr_Occurred() != nullptr) + { + PyErr_Print(); + PyErr_Clear(); + } + } + else + { + imported = true; + } + } + + return imported; +} + +PyObject *xlsx2arrow(PyObject *pyfile) +{ + if (!import_pyarrow()) + { + Py_RETURN_NONE; + } + + xlnt::python_streambuf buffer(pyfile); std::istream stream(&buffer); auto table = xlnt::xlsx2arrow(stream); + return arrow::py::wrap_table(table); +} + +PyObject *arrow2xlsx(PyObject *pytable, PyObject *pyfile) +{ + if (!import_pyarrow()) + { + Py_RETURN_NONE; + } + + (void)pytable; + (void)pyfile; + /* + auto table = arrow::py::unwrap_table(pytable); + xlnt::python_streambuf buffer(pyfile); + std::ostream stream(&buffer); + xlnt::arrow2xlsx(table, stream); + */ + Py_RETURN_NONE; } @@ -28,10 +76,11 @@ Returns an arrow table representing the given XLSX file object."); PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwargs) { - PyObject *file = NULL; static const char *keywords[] = { "file", NULL }; static auto keywords_nc = const_cast(keywords); + PyObject *file = NULL; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", keywords_nc, &file)) { return NULL; @@ -50,16 +99,18 @@ Writes the given arrow table to out_file as an XLSX file."); PyObject *xlntpyarrow_arrow2xlsx(PyObject *self, PyObject *args, PyObject *kwargs) { - PyObject *obj = NULL; - static const char *keywords[] = { "file", NULL }; + static const char *keywords[] = { "table", "file", NULL }; static auto keywords_nc = const_cast(keywords); - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "Oi", keywords_nc, &obj)) + PyObject *table = NULL; + PyObject *file = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO", keywords_nc, &table, &file)) { return NULL; } - Py_RETURN_NONE; + return arrow2xlsx(table, file); } // 2.7/3 compatible based on https://docs.python.org/3/howto/cporting.html