Now we're getting somewhere! xlntpyarrow.xlsx2arrow now returns an Arrow table that can be converted to a pandas DataFrame

This commit is contained in:
Thomas Fussell 2017-07-15 10:39:36 -07:00
parent 5b95b3d463
commit 33399a5390
4 changed files with 102 additions and 9 deletions

View File

@ -10,6 +10,6 @@ class Table;
namespace xlnt { namespace xlnt {
std::shared_ptr<arrow::Table> XLNT_API xlsx2arrow(std::istream &s); std::shared_ptr<arrow::Table> XLNT_API xlsx2arrow(std::istream &s);
void XLNT_API arrow2xlsx(std::shared_ptr<const arrow::Table> &table, std::ostream &s); void XLNT_API arrow2xlsx(std::shared_ptr<arrow::Table> &table, std::ostream &s);
} // namespace xlnt } // namespace xlnt

View File

@ -26,6 +26,7 @@
#include <arrow/api.h> #include <arrow/api.h>
#pragma warning(pop) #pragma warning(pop)
#include <detail/default_case.hpp>
#include <xlnt/cell/cell.hpp> #include <xlnt/cell/cell.hpp>
#include <xlnt/cell/cell_reference.hpp> #include <xlnt/cell/cell_reference.hpp>
#include <xlnt/utils/xlntarrow.hpp> #include <xlnt/utils/xlntarrow.hpp>
@ -52,6 +53,8 @@ std::unique_ptr<arrow::ArrayBuilder> make_array_builder(xlnt::cell::type type)
case xlnt::cell::type::date: case xlnt::cell::type::date:
return std::unique_ptr<arrow::Date32Builder>(new arrow::Date32Builder(arrow::default_memory_pool())); return std::unique_ptr<arrow::Date32Builder>(new arrow::Date32Builder(arrow::default_memory_pool()));
} }
default_case(std::unique_ptr<arrow::ArrayBuilder>(nullptr));
} }
arrow::Field make_type_field(const std::string &name, xlnt::cell::type type) arrow::Field make_type_field(const std::string &name, xlnt::cell::type type)
@ -71,6 +74,8 @@ arrow::Field make_type_field(const std::string &name, xlnt::cell::type type)
case xlnt::cell::type::date: case xlnt::cell::type::date:
return arrow::Field(name, arrow::date32()); return arrow::Field(name, arrow::date32());
} }
default_case(arrow::Field("", arrow::null()));
} }
} // namespace } // namespace
@ -103,6 +108,7 @@ std::shared_ptr<arrow::Table> XLNT_API xlsx2arrow(std::istream &s)
if (cell.row() == 1) if (cell.row() == 1)
{ {
column_names.push_back(cell.value<std::string>()); column_names.push_back(cell.value<std::string>());
continue;
} }
else if (cell.row() == 2) else if (cell.row() == 2)
{ {
@ -111,6 +117,41 @@ std::shared_ptr<arrow::Table> XLNT_API xlsx2arrow(std::istream &s)
fields.push_back(std::make_shared<arrow::Field>(field)); fields.push_back(std::make_shared<arrow::Field>(field));
columns.push_back(make_array_builder(cell.data_type())); columns.push_back(make_array_builder(cell.data_type()));
} }
// Append the current cell's value to the Arrow builder that belongs to the
// cell's column (column index minus one selects the entry in `columns`).
auto builder = columns.at(cell.column().index - 1).get();

// NOTE(review): each static_cast below assumes the column's builder was
// created for the same cell type as this cell. A column whose cells have
// differing types would be cast to the wrong builder -- confirm that the
// input guarantees homogeneous columns or add a check.
switch (cell.data_type())
{
case xlnt::cell::type::number:
{
    auto typed_builder = static_cast<arrow::DoubleBuilder*>(builder);
    // Store the cell's numeric value; appending a constant 0 would make
    // every numeric column read back as all zeros.
    typed_builder->Append(cell.value<double>());
    break;
}

// Every non-numeric, non-boolean, non-date type is stored as text.
case xlnt::cell::type::inline_string:
case xlnt::cell::type::shared_string:
case xlnt::cell::type::error:
case xlnt::cell::type::formula_string:
case xlnt::cell::type::empty:
{
    auto typed_builder = static_cast<arrow::StringBuilder*>(builder);
    typed_builder->Append(cell.value<std::string>());
    break;
}

case xlnt::cell::type::boolean:
{
    auto typed_builder = static_cast<arrow::BooleanBuilder*>(builder);
    typed_builder->Append(cell.value<bool>());
    break;
}

case xlnt::cell::type::date:
{
    auto typed_builder = static_cast<arrow::Date32Builder*>(builder);
    // NOTE(review): the integer cell value is appended unchanged; verify
    // that it uses the day epoch Date32 expects.
    typed_builder->Append(cell.value<int>());
    break;
}
}
} }
reader.end_worksheet(); reader.end_worksheet();
@ -131,7 +172,7 @@ std::shared_ptr<arrow::Table> XLNT_API xlsx2arrow(std::istream &s)
return table; return table;
} }
void XLNT_API arrow2xlsx(std::shared_ptr<const arrow::Table> &table, std::ostream &s) void XLNT_API arrow2xlsx(std::shared_ptr<arrow::Table> &table, std::ostream &s)
{ {
xlnt::streaming_workbook_writer writer; xlnt::streaming_workbook_writer writer;
writer.open(s); writer.open(s);

View File

@ -42,6 +42,7 @@ xlntpyarrow_extension = Extension(
include_dirs = include_dirs, include_dirs = include_dirs,
libraries = [ libraries = [
'arrow', 'arrow',
'arrow_python',
'xlnt' 'xlnt'
], ],
library_dirs = library_dirs, library_dirs = library_dirs,

View File

@ -3,17 +3,65 @@
#include <vector> #include <vector>
#include <arrow/api.h> #include <arrow/api.h>
#include <arrow/python/pyarrow.h>
#include <Python.h> // must be included after Arrow #include <Python.h> // must be included after Arrow
#include <python_streambuf.hpp> #include <python_streambuf.hpp>
#include <xlnt/utils/xlntarrow.hpp> #include <xlnt/utils/xlntarrow.hpp>
PyObject *xlsx2arrow(PyObject *file) bool import_pyarrow()
{ {
xlnt::python_streambuf buffer(file); static bool imported = false;
// Import the pyarrow C API at most once; `imported` records a successful
// import so later calls skip straight to the return.
if (!imported)
{
    // arrow::py::import_pyarrow() returns 0 on success and a non-zero value
    // (with a Python exception set) on failure, so compare against 0
    // explicitly instead of negating the result.
    if (arrow::py::import_pyarrow() != 0)
    {
        // Report the pending Python error, then clear it so the caller can
        // return to the interpreter without an exception set.
        if (PyErr_Occurred() != nullptr)
        {
            PyErr_Print();
            PyErr_Clear();
        }
    }
    else
    {
        imported = true;
    }
}
return imported;
}
PyObject *xlsx2arrow(PyObject *pyfile)
{
if (!import_pyarrow())
{
Py_RETURN_NONE;
}
xlnt::python_streambuf buffer(pyfile);
std::istream stream(&buffer); std::istream stream(&buffer);
auto table = xlnt::xlsx2arrow(stream); auto table = xlnt::xlsx2arrow(stream);
return arrow::py::wrap_table(table);
}
PyObject *arrow2xlsx(PyObject *pytable, PyObject *pyfile)
{
if (!import_pyarrow())
{
Py_RETURN_NONE;
}
(void)pytable;
(void)pyfile;
/*
auto table = arrow::py::unwrap_table(pytable);
xlnt::python_streambuf buffer(pyfile);
std::ostream stream(&buffer);
xlnt::arrow2xlsx(table, stream);
*/
Py_RETURN_NONE; Py_RETURN_NONE;
} }
@ -28,10 +76,11 @@ Returns an arrow table representing the given XLSX file object.");
PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwargs) PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwargs)
{ {
PyObject *file = NULL;
static const char *keywords[] = { "file", NULL }; static const char *keywords[] = { "file", NULL };
static auto keywords_nc = const_cast<char **>(keywords); static auto keywords_nc = const_cast<char **>(keywords);
PyObject *file = NULL;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", keywords_nc, &file)) if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", keywords_nc, &file))
{ {
return NULL; return NULL;
@ -50,16 +99,18 @@ Writes the given arrow table to out_file as an XLSX file.");
PyObject *xlntpyarrow_arrow2xlsx(PyObject *self, PyObject *args, PyObject *kwargs) PyObject *xlntpyarrow_arrow2xlsx(PyObject *self, PyObject *args, PyObject *kwargs)
{ {
PyObject *obj = NULL; static const char *keywords[] = { "table", "file", NULL };
static const char *keywords[] = { "file", NULL };
static auto keywords_nc = const_cast<char **>(keywords); static auto keywords_nc = const_cast<char **>(keywords);
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "Oi", keywords_nc, &obj)) PyObject *table = NULL;
PyObject *file = NULL;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO", keywords_nc, &table, &file))
{ {
return NULL; return NULL;
} }
Py_RETURN_NONE; return arrow2xlsx(table, file);
} }
// 2.7/3 compatible based on https://docs.python.org/3/howto/cporting.html // 2.7/3 compatible based on https://docs.python.org/3/howto/cporting.html