figured out the problem

This commit is contained in:
Thomas Fussell 2017-07-30 20:32:37 -07:00
parent de0e010056
commit 8801a0e352
5 changed files with 41 additions and 35 deletions

View File

@ -118,6 +118,12 @@ public:
/// </summary> /// </summary>
void open(std::istream &stream); void open(std::istream &stream);
/// <summary>
/// Holds the given streambuf internally, creates a std::istream backed
/// by the given buffer, and calls open(std::istream &) with that stream.
/// The buffer is taken by swapping it with the reader's internal buffer
/// pointer, so after the call the argument holds whatever buffer the
/// reader held before (null if it held none).
/// </summary>
void open(std::unique_ptr<std::streambuf> &&buffer);
/// <summary> /// <summary>
/// Returns a vector of the titles of sheets in the workbook in order. /// Returns a vector of the titles of sheets in the workbook in order.
/// </summary> /// </summary>

View File

@ -158,6 +158,13 @@ void streaming_workbook_reader::open(std::istream &stream)
const auto workbook_path = workbook_rel.target().path(); const auto workbook_path = workbook_rel.target().path();
} }
void streaming_workbook_reader::open(std::unique_ptr<std::streambuf> &&buffer)
{
stream_buffer_.swap(buffer);
stream_.reset(new std::istream(stream_buffer_.get()));
open(*stream_);
}
std::vector<std::string> streaming_workbook_reader::sheet_titles() std::vector<std::string> streaming_workbook_reader::sheet_titles()
{ {
return workbook_->sheet_titles(); return workbook_->sheet_titles();

View File

@ -102,7 +102,6 @@ class python_streambuf : public std::basic_streambuf<char>
member function readsome to work correctly (c.f. 27.6.1.3, alinea 30) member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
*/ */
virtual std::streamsize showmanyc() { virtual std::streamsize showmanyc() {
std::cout << "showmanyc" << std::endl;
int_type const failure = traits_type::eof(); int_type const failure = traits_type::eof();
int_type status = underflow(); int_type status = underflow();
if (status == failure) return -1; if (status == failure) return -1;
@ -111,7 +110,6 @@ class python_streambuf : public std::basic_streambuf<char>
/// C.f. C++ standard section 27.5.2.4.3 /// C.f. C++ standard section 27.5.2.4.3
virtual int_type underflow() { virtual int_type underflow() {
std::cout << "underflow" << std::endl;
int_type const failure = traits_type::eof(); int_type const failure = traits_type::eof();
if (py_read.is_none()) { if (py_read.is_none()) {
throw std::invalid_argument( throw std::invalid_argument(
@ -136,7 +134,6 @@ class python_streambuf : public std::basic_streambuf<char>
/// C.f. C++ standard section 27.5.2.4.5 /// C.f. C++ standard section 27.5.2.4.5
virtual int_type overflow(int_type c=traits_type_eof()) { virtual int_type overflow(int_type c=traits_type_eof()) {
std::cout << "overflow" << std::endl;
if (py_write.is_none()) { if (py_write.is_none()) {
throw std::invalid_argument( throw std::invalid_argument(
"That Python file object has no 'write' attribute"); "That Python file object has no 'write' attribute");
@ -168,7 +165,6 @@ class python_streambuf : public std::basic_streambuf<char>
seek position in that read buffer. seek position in that read buffer.
*/ */
virtual int sync() { virtual int sync() {
std::cout << "sync" << std::endl;
int result = 0; int result = 0;
farthest_pptr = std::max(farthest_pptr, pptr()); farthest_pptr = std::max(farthest_pptr, pptr());
if (farthest_pptr && farthest_pptr > pbase()) { if (farthest_pptr && farthest_pptr > pbase()) {
@ -201,7 +197,6 @@ class python_streambuf : public std::basic_streambuf<char>
std::ios_base::openmode which= std::ios_base::in std::ios_base::openmode which= std::ios_base::in
| std::ios_base::out) | std::ios_base::out)
{ {
std::cout << "seekoff" << std::endl;
/* In practice, "which" is either std::ios_base::in or out /* In practice, "which" is either std::ios_base::in or out
since we end up here because either seekp or seekg was called since we end up here because either seekp or seekg was called
on the stream using this buffer. That simplifies the code on the stream using this buffer. That simplifies the code
@ -259,7 +254,6 @@ class python_streambuf : public std::basic_streambuf<char>
std::ios_base::openmode which= std::ios_base::in std::ios_base::openmode which= std::ios_base::in
| std::ios_base::out) | std::ios_base::out)
{ {
std::cout << "seekpos" << std::endl;
return python_streambuf::seekoff(sp, std::ios_base::beg, which); return python_streambuf::seekoff(sp, std::ios_base::beg, which);
} }

View File

@ -37,24 +37,27 @@ def xlsx2arrow(io, sheetname):
column_names = [] column_names = []
fields = [] fields = []
batches = [] batches = []
schema = None
while reader.has_cell(): while reader.has_cell():
print('read_cell')
cell = reader.read_cell() cell = reader.read_cell()
type = cell.data_type() type = cell.data_type()
print('read_cell', cell.row(), cell.column())
if cell.row() == 1: if cell.row() == 1:
column_names.push_back(cell.value_string()) column_names.append(cell.value_string())
continue continue
elif cell.row() == 2: elif cell.row() == 2:
column_name = column_names[cell.column() - 1] column_name = column_names[cell.column() - 1]
fields.append(pa.Field(column_name, COLUMN_TYPE_FIELD[type]())) fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
continue continue
elif schema is None: elif schema is None:
schema = pa.schema(fields) schema = pa.schema(fields)
batch = xpa.read_batch(schema, 0) print(schema)
print(batch)
batch = reader.read_batch(schema, 100000)
batches.append(batch) batches.append(batch)
break break
@ -65,4 +68,5 @@ def xlsx2arrow(io, sheetname):
if __name__ == '__main__': if __name__ == '__main__':
file = open('tmp.xlsx', 'rb') file = open('tmp.xlsx', 'rb')
print(xlsx2arrow(file, 'Sheet1')) table = xlsx2arrow(file, 'Sheet1')
print(table.to_pandas())

View File

@ -154,9 +154,7 @@ std::unique_ptr<arrow::ArrayBuilder> make_array_builder(arrow::Type::type type)
void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file) void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file)
{ {
xlnt::python_streambuf buffer(file); reader.open(std::unique_ptr<std::streambuf>(new xlnt::python_streambuf(file)));
std::istream stream(&buffer);
reader.open(stream);
} }
pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
@ -167,36 +165,29 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
std::shared_ptr<arrow::Schema> schema; std::shared_ptr<arrow::Schema> schema;
arrow::py::unwrap_schema(pyschema.ptr(), &schema); arrow::py::unwrap_schema(pyschema.ptr(), &schema);
std::cout << "1" << std::endl;
auto column_types = extract_schema_types(schema); auto column_types = extract_schema_types(schema);
auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>(); auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>();
auto num_rows = std::int64_t(0); auto num_rows = std::int64_t(0);
std::cout << "2" << std::endl;
for (auto type : column_types) for (auto type : column_types)
{ {
builders.push_back(make_array_builder(type)); builders.push_back(make_array_builder(type));
} }
std::cout << "3" << std::endl;
for (auto row = 0; row < max_rows; ++row) for (auto row = 0; row < max_rows; ++row)
{ {
if (!reader.has_cell()) break; if (!reader.has_cell()) break;
std::cout << "4" << std::endl; if (row % 1000 == 0)
{
std::cout << row << std::endl;
}
for (auto column = 0; column < schema->num_fields(); ++column) for (auto column = 0; column < schema->num_fields(); ++column)
{ {
if (!reader.has_cell()) break; if (!reader.has_cell()) break;
std::cout << "5" << std::endl;
auto cell = reader.read_cell(); auto cell = reader.read_cell();
/*
auto column_type = column_types.at(column); auto column_type = column_types.at(column);
auto builder = builders.at(cell.column().index - 1).get(); auto builder = builders.at(cell.column().index - 1).get();
@ -287,14 +278,11 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
case arrow::Type::DICTIONARY: case arrow::Type::DICTIONARY:
break; break;
} }
*/
} }
++num_rows; ++num_rows;
} }
std::cout << "6" << std::endl;
auto columns = std::vector<std::shared_ptr<arrow::Array>>(); auto columns = std::vector<std::shared_ptr<arrow::Array>>();
for (auto &builder : builders) for (auto &builder : builders)
@ -304,14 +292,10 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
columns.emplace_back(column); columns.emplace_back(column);
} }
std::cout << "7" << std::endl;
auto batch_pointer = std::make_shared<arrow::RecordBatch>(schema, num_rows, columns); auto batch_pointer = std::make_shared<arrow::RecordBatch>(schema, num_rows, columns);
auto batch_object = arrow::py::wrap_record_batch(batch_pointer); auto batch_object = arrow::py::wrap_record_batch(batch_pointer);
auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right? auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right?
std::cout << "8" << std::endl;
return batch_handle; return batch_handle;
} }
@ -330,10 +314,21 @@ PYBIND11_MODULE(xlntpyarrow, m)
.def("open", &open_file) .def("open", &open_file)
.def("read_batch", &read_batch); .def("read_batch", &read_batch);
pybind11::class_<xlnt::worksheet>(m, "Worksheet");
pybind11::class_<xlnt::cell> cell(m, "Cell"); pybind11::class_<xlnt::cell> cell(m, "Cell");
cell.def("value_string", [](xlnt::cell cell) cell.def("value_string", [](xlnt::cell &cell)
{ {
return cell.value<std::string>(); return cell.value<std::string>();
})
.def("data_type", [](xlnt::cell &cell)
{
return cell.data_type();
})
.def("row", &xlnt::cell::row)
.def("column", [](xlnt::cell &cell)
{
return cell.column().index;
}); });
pybind11::enum_<xlnt::cell::type>(cell, "Type") pybind11::enum_<xlnt::cell::type>(cell, "Type")