figured out the problem

This commit is contained in:
Thomas Fussell 2017-07-30 20:32:37 -07:00
parent de0e010056
commit 8801a0e352
5 changed files with 41 additions and 35 deletions

View File

@ -118,6 +118,12 @@ public:
/// </summary>
void open(std::istream &stream);
/// <summary>
/// Holds the given streambuf internally (it is swapped into the reader, so
/// the reader keeps it after this call returns), creates a std::istream
/// backed by that buffer, and calls open(std::istream &) with that stream.
/// </summary>
void open(std::unique_ptr<std::streambuf> &&buffer);
/// <summary>
/// Returns a vector of the titles of sheets in the workbook in order.
/// </summary>

View File

@ -158,6 +158,13 @@ void streaming_workbook_reader::open(std::istream &stream)
const auto workbook_path = workbook_rel.target().path();
}
// Opens the workbook from a caller-supplied stream buffer: the buffer is
// stored on the reader, wrapped in a std::istream that the reader also keeps,
// and the work is then delegated to open(std::istream &).
void streaming_workbook_reader::open(std::unique_ptr<std::streambuf> &&buffer)
{
    // Swap (not assign) so the incoming buffer lands in stream_buffer_ and
    // whatever was held before is left in `buffer`.
    stream_buffer_.swap(buffer);

    // The stream only borrows the buffer; stream_buffer_ remains the holder.
    auto *const held_buffer = stream_buffer_.get();
    stream_.reset(new std::istream(held_buffer));

    auto &owned_stream = *stream_;
    open(owned_stream);
}
std::vector<std::string> streaming_workbook_reader::sheet_titles()
{
return workbook_->sheet_titles();

View File

@ -102,7 +102,6 @@ class python_streambuf : public std::basic_streambuf<char>
member function readsome to work correctly (c.f. 27.6.1.3, paragraph 30)
*/
virtual std::streamsize showmanyc() {
std::cout << "showmanyc" << std::endl;
int_type const failure = traits_type::eof();
int_type status = underflow();
if (status == failure) return -1;
@ -111,7 +110,6 @@ class python_streambuf : public std::basic_streambuf<char>
/// C.f. C++ standard section 27.5.2.4.3
virtual int_type underflow() {
std::cout << "underflow" << std::endl;
int_type const failure = traits_type::eof();
if (py_read.is_none()) {
throw std::invalid_argument(
@ -136,7 +134,6 @@ class python_streambuf : public std::basic_streambuf<char>
/// C.f. C++ standard section 27.5.2.4.5
virtual int_type overflow(int_type c=traits_type_eof()) {
std::cout << "overflow" << std::endl;
if (py_write.is_none()) {
throw std::invalid_argument(
"That Python file object has no 'write' attribute");
@ -168,7 +165,6 @@ class python_streambuf : public std::basic_streambuf<char>
seek position in that read buffer.
*/
virtual int sync() {
std::cout << "sync" << std::endl;
int result = 0;
farthest_pptr = std::max(farthest_pptr, pptr());
if (farthest_pptr && farthest_pptr > pbase()) {
@ -201,7 +197,6 @@ class python_streambuf : public std::basic_streambuf<char>
std::ios_base::openmode which= std::ios_base::in
| std::ios_base::out)
{
std::cout << "seekoff" << std::endl;
/* In practice, "which" is either std::ios_base::in or out
since we end up here because either seekp or seekg was called
on the stream using this buffer. That simplifies the code
@ -259,7 +254,6 @@ class python_streambuf : public std::basic_streambuf<char>
std::ios_base::openmode which= std::ios_base::in
| std::ios_base::out)
{
std::cout << "seekpos" << std::endl;
return python_streambuf::seekoff(sp, std::ios_base::beg, which);
}

View File

@ -37,24 +37,27 @@ def xlsx2arrow(io, sheetname):
column_names = []
fields = []
batches = []
schema = None
while reader.has_cell():
print('read_cell')
cell = reader.read_cell()
type = cell.data_type()
print('read_cell', cell.row(), cell.column())
if cell.row() == 1:
column_names.push_back(cell.value_string())
column_names.append(cell.value_string())
continue
elif cell.row() == 2:
column_name = column_names[cell.column() - 1]
fields.append(pa.Field(column_name, COLUMN_TYPE_FIELD[type]()))
fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
continue
elif schema is None:
schema = pa.schema(fields)
batch = xpa.read_batch(schema, 0)
print(batch)
print(schema)
batch = reader.read_batch(schema, 100000)
batches.append(batch)
break
@ -65,4 +68,5 @@ def xlsx2arrow(io, sheetname):
if __name__ == '__main__':
file = open('tmp.xlsx', 'rb')
print(xlsx2arrow(file, 'Sheet1'))
table = xlsx2arrow(file, 'Sheet1')
print(table.to_pandas())

View File

@ -154,9 +154,7 @@ std::unique_ptr<arrow::ArrayBuilder> make_array_builder(arrow::Type::type type)
void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file)
{
xlnt::python_streambuf buffer(file);
std::istream stream(&buffer);
reader.open(stream);
reader.open(std::unique_ptr<std::streambuf>(new xlnt::python_streambuf(file)));
}
pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
@ -167,36 +165,29 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
std::shared_ptr<arrow::Schema> schema;
arrow::py::unwrap_schema(pyschema.ptr(), &schema);
std::cout << "1" << std::endl;
auto column_types = extract_schema_types(schema);
auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>();
auto num_rows = std::int64_t(0);
std::cout << "2" << std::endl;
for (auto type : column_types)
{
builders.push_back(make_array_builder(type));
}
std::cout << "3" << std::endl;
for (auto row = 0; row < max_rows; ++row)
{
if (!reader.has_cell()) break;
std::cout << "4" << std::endl;
if (row % 1000 == 0)
{
std::cout << row << std::endl;
}
for (auto column = 0; column < schema->num_fields(); ++column)
{
if (!reader.has_cell()) break;
std::cout << "5" << std::endl;
auto cell = reader.read_cell();
/*
auto column_type = column_types.at(column);
auto builder = builders.at(cell.column().index - 1).get();
@ -287,14 +278,11 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
case arrow::Type::DICTIONARY:
break;
}
*/
}
++num_rows;
}
std::cout << "6" << std::endl;
auto columns = std::vector<std::shared_ptr<arrow::Array>>();
for (auto &builder : builders)
@ -304,14 +292,10 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
columns.emplace_back(column);
}
std::cout << "7" << std::endl;
auto batch_pointer = std::make_shared<arrow::RecordBatch>(schema, num_rows, columns);
auto batch_object = arrow::py::wrap_record_batch(batch_pointer);
auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right?
std::cout << "8" << std::endl;
return batch_handle;
}
@ -330,10 +314,21 @@ PYBIND11_MODULE(xlntpyarrow, m)
.def("open", &open_file)
.def("read_batch", &read_batch);
pybind11::class_<xlnt::worksheet>(m, "Worksheet");
pybind11::class_<xlnt::cell> cell(m, "Cell");
cell.def("value_string", [](xlnt::cell cell)
cell.def("value_string", [](xlnt::cell &cell)
{
return cell.value<std::string>();
})
.def("data_type", [](xlnt::cell &cell)
{
return cell.data_type();
})
.def("row", &xlnt::cell::row)
.def("column", [](xlnt::cell &cell)
{
return cell.column().index;
});
pybind11::enum_<xlnt::cell::type>(cell, "Type")